{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "95444cd5-1f59-4891-8450-d0c91c0bd642",
   "metadata": {},
   "source": [
    "# 所有zotero中的条目信息"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "520b9ddf-87d5-4df0-b136-b30e015f23f1",
   "metadata": {},
   "source": [
    "## 先加载所有信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "93f3aecf-8e33-4251-9fef-991832d99372",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-25T14:16:11.384373Z",
     "start_time": "2025-07-25T14:15:35.465447Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total items: 1183\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from pyzotero import zotero\n",
    "\n",
    "library_id = '11562465'\n",
    "library_type = 'user'\n",
    "api_key = '0bQwmW6asXU18FIOvFbu2YHN'\n",
    "locale = 'zh-CN'\n",
    "\n",
    "# 实例化 Zotero 对象\n",
    "zot = zotero.Zotero(library_id, library_type, api_key, locale)\n",
    "\n",
    "# 初始化一个列表来存储所有条目\n",
    "all_items = []\n",
    "\n",
    "# 初始化分页参数\n",
    "start = 0\n",
    "items_per_page = 100\n",
    "\n",
    "# 循环获取所有数据\n",
    "while True:\n",
    "    # 获取当前页的数据\n",
    "    items = zot.items(limit=items_per_page, start=start)\n",
    "    if not items:\n",
    "        break  # 如果没有更多数据，退出循环\n",
    "    all_items.extend(items)  # 将当前页的数据添加到列表中\n",
    "    start += items_per_page  # 更新分页参数\n",
    "\n",
    "# 打印所有条目\n",
    "# print(json.dumps(all_items, indent=4, ensure_ascii=False))\n",
    "\n",
    "# 打印条目数量\n",
    "print(f\"Total items: {len(all_items)}\")"
   ]
  },
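  {
   "cell_type": "markdown",
   "id": "f0a1b2c3-d4e5-4f60-8a1b-2c3d4e5f6071",
   "metadata": {},
   "source": [
    "As an aside, pyzotero also provides an `everything()` helper that follows the pagination links internally, so the manual `while` loop above can be replaced. A minimal sketch, assuming the `zot` client from the previous cell (it still issues one request per page under the hood):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0a1b2c3-d4e5-4f60-8a1b-2c3d4e5f6072",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternative to the manual pagination loop: everything() keeps\n",
    "# requesting pages until the library is exhausted.\n",
    "all_items_alt = zot.everything(zot.items())\n",
    "print(f\"Total items: {len(all_items_alt)}\")"
   ]
  },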
  {
   "cell_type": "markdown",
   "id": "9b4e5aee-4892-44a1-bc53-23097a24a32b",
   "metadata": {},
   "source": [
    "## 三种类别的文章信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5b00355b-f462-4502-881a-b01434d3797b",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-25T14:18:06.642759Z",
     "start_time": "2025-07-25T14:18:06.621761Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 要保存的数据列表\n",
    "journalArticle_data = []\n",
    "for item in all_items:\n",
    "    # 只提取期刊文章，也就是论文的pdf\n",
    "    if 'itemType' in item['data'] and item['data']['itemType'] == 'journalArticle':\n",
    "        tags = []\n",
    "        for tag in item['data']['tags']:\n",
    "            tags.append(tag['tag'])\n",
    "        if(item['data']['creators']):\n",
    "            creator = item['data']['creators'][0]['lastName'] + item['data']['creators'][0]['firstName']\n",
    "        else:\n",
    "            creator = 'Unknown Author'\n",
    "        journalArticle_data.append({\n",
    "            'title': item['data']['title'],\n",
    "            'creators': creator,\n",
    "            'abstractNote': item['data']['abstractNote'].replace(',', '，'),\n",
    "            'publicationTitle': item['data']['publicationTitle'],\n",
    "            'date': item['data']['date'],\n",
    "            'language': item['data']['language'],\n",
    "            'url': item['data']['url'],\n",
    "            'libraryCatalog': item['data']['libraryCatalog'],\n",
    "            'tags': tags\n",
    "        })"
   ]
  },
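  {
   "cell_type": "markdown",
   "id": "f0a1b2c3-d4e5-4f60-8a1b-2c3d4e5f6073",
   "metadata": {},
   "source": [
    "The fullwidth-comma substitution above suggests these records are destined for a comma-separated file. A minimal sketch of one way to write them out, using only the standard library; the filename `journal_articles.csv` is just an illustration, and the tag list is joined with `;` so it fits in a single column:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0a1b2c3-d4e5-4f60-8a1b-2c3d4e5f6074",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "# Column order mirrors the dict keys built in the cell above.\n",
    "fieldnames = ['title', 'creators', 'abstractNote', 'publicationTitle',\n",
    "              'date', 'language', 'url', 'libraryCatalog', 'tags']\n",
    "\n",
    "# utf-8-sig keeps the Chinese text readable when the file is opened in Excel.\n",
    "with open('journal_articles.csv', 'w', newline='', encoding='utf-8-sig') as f:\n",
    "    writer = csv.DictWriter(f, fieldnames=fieldnames)\n",
    "    writer.writeheader()\n",
    "    for row in journalArticle_data:\n",
    "        writer.writerow({**row, 'tags': ';'.join(row['tags'])})"
   ]
  },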
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1dff17ffe1ed7fe5",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-25T14:18:07.832981Z",
     "start_time": "2025-07-25T14:18:07.807015Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'title': 'A novel framework on intelligent detection for module defects of PV plant combining the visible and infrared images一种结合可见光和红外图像的光伏电站组件缺陷智能检测新框架',\n",
       "  'creators': 'HongFeng',\n",
       "  'abstractNote': 'Solar Photovoltaic (PV) industry has achieved rapid development in recent years. However， it is difficult and costly to detect the micro fault area in a large PV power plant due to environmental factors and missing data. Most faults can be detected by the infrared temperature measurement method， but the infrared camera characteristics constrain it. This paper proposed a novel framework， consisting of image acquirement， image segmentation， fault orientation and defect warning， to remedy the limitations for PV module defects. The visible and infrared PV array images are taken under the same conditions by a dual infrared camera at low altitudes. The deep learning methods， including the fifth version of You Only Look Once (YOLOv5) algorithm and Deep Residual Network (ResNet) algorithm， are introduced to this framework. Hence， this framework has strong capability to suit almost all brightness conditions， by the combination of image segmentation from visible images and fault location on infrared images. The results show that this framework dramatically improves the separation speed of photovoltaic array to 36 Fps and the accuracy of fault detection to 95% by infrared image marked with the segmented area.',\n",
       "  'publicationTitle': 'Solar Energy',\n",
       "  'date': '2022-04-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0038092X22001840',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Image fusion', 'Module defect', 'PV plant', 'ResNet', 'YOLOv5']},\n",
       " {'title': 'Automatic detection of photovoltaic module defects in infrared images with isolated and develop-model transfer deep learning通过隔离和开发模型迁移深度学习自动检测红外图像中的光伏组件缺陷',\n",
       "  'creators': 'AkramM. Waqar',\n",
       "  'abstractNote': 'With the rising use of photovoltaic and ongoing installation of large-scale photovoltaic systems worldwide， the automation of photovoltaic monitoring methods becomes important， as manual/visual inspection has limited applications. This research work deals with automatic detection of photovoltaic module defects in Infrared images with isolated deep learning and develop-model transfer deep learning techniques. An Infrared images dataset containing infrared images of normal operating and defective modules is collected and used to train the networks. The dataset is obtained from Infrared imaging performed on normal operating and defective photovoltaic modules with lab induced defects. An isolated learned model is trained from scratch using a light convolutional neural network design that achieved an average accuracy of 98.67%. For transfer learning， a base model is first developed (pre-trained) from electroluminescence images dataset of photovoltaic cells and then fine-tuned on infrared images dataset， that achieved an average accuracy of 99.23%. Both frameworks require low computation power and less time; and can be implemented with ordinary hardware. They also maintained real time prediction speed. The comparison shows that the develop-model transfer learning technique can help to improve the performance. In addition， we reviewed different kind of defects detectable from infrared imaging of photovoltaic modules， that can help in manual labelling for identifying different defect categories upon access to new huge data in future studies. Last of all， the presented frameworks are applied for experimental testing and qualitative evaluation.',\n",
       "  'publicationTitle': 'Solar Energy',\n",
       "  'date': '2020-03-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0038092X20300621',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Automatic defect detection',\n",
       "   'Develop-model transfer deep learning',\n",
       "   'Infrared images',\n",
       "   'Isolated deep learning',\n",
       "   'Photovoltaic (PV) modules',\n",
       "   'Thermography']},\n",
       " {'title': 'DAF-DETR: A dynamic adaptation feature transformer for enhanced object detection in unmanned aerial vehicles',\n",
       "  'creators': 'SongBaoye',\n",
       "  'abstractNote': 'Object detection in complex environments is challenged by overlapping objects， complex spatial relationships， and dynamic variations in target scales. To address these challenges， the Dynamic Adaptation Feature DEtection TRansformer (DAF-DETR) is proposed as a novel transformer-based model optimized for real-time detection in spatially complex environments. The framework introduces four key innovations. First， a learnable position encoding mechanism is employed in place of fixed positional encoding， enhancing adaptability and flexibility when processing complex spatial layouts. Second， the Resynthetic Network (ResynNet) backbone， which consists of stacked Resynthetic Blocks (ResynBlocks) integrating ResBlock and FasterBlock feature extraction strategies， is designed to optimize multi-scale feature representation and improve computational efficiency. Third， an enhanced feature fusion module is incorporated to strengthen the detection of small， densely packed objects by integrating multi-scale contextual information. Fourth， a dynamic perception module is introduced， utilizing deformable attention to capture complex spatial relationships between overlapping objects. Extensive experiments conducted on the Vision meets Drone 2019 (VisDrone2019) and Tiny Object Detection in Aerial Images (AI-TOD) datasets demonstrate the superiority of DAF-DETR， achieving state-of-the-art detection accuracy while maintaining real-time efficiency. The results confirm its robustness in handling scale variations， occlusions， and spatial complexity， establishing it as a reliable solution for real-world applications such as aerial imagery and crowded scene analysis.',\n",
       "  'publicationTitle': 'Knowledge-Based Systems',\n",
       "  'date': '07/2025',\n",
       "  'language': 'en',\n",
       "  'url': 'https://linkinghub.elsevier.com/retrieve/pii/S0950705125008068',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'Fast fault detection method for photovoltaic arrays with adaptive deep multiscale feature enhancement',\n",
       "  'creators': 'GongBin',\n",
       "  'abstractNote': 'Photovoltaic (PV) arrays have output characteristics such as randomness and intermittency， and faults can seriously affect the safe operation of the power system. In order to improve the comprehensive performance of the PV array fault diagnosis model， a new intelligent online fault monitoring method for PV arrays is proposed in this paper. (1) a three-dimensional channel feature map based on I， V， and P features is constructed because the IV and P curves of the PV array have significantly different effects under different fault conditions. (2) The PV array fault diagnosis model based on a multi-source information fusion network (MIFNet) is proposed， and Channel Mixing Convolution (CMC) module， three-dimensional feature attention enhancement (TDFAE) module， and Channel normalized scaling (CNS) module are designed to improve the comprehensive performance of the model. (3) An adaptive nonlinear mutual sparrow search algorithm (ANMSSA) is proposed to optimize the hyperparameter configuration of the MIFNet network. The experimental results show that the average recognition accuracy， prediction accuracy， and sensitivity of the ANMSSA-MIFNet network proposed in this paper are 99.64%， 99.64%， and 99.71% respectively. When facing single-component faults and multi-component faults， the model has stronger diagnostic accuracy， robustness， anti-noise ability， and stability， and can efficiently diagnose different faults of PV arrays， providing the scientific basis and theoretical support for the operation of PV systems.',\n",
       "  'publicationTitle': 'Applied Energy',\n",
       "  'date': '01/2024',\n",
       "  'language': 'en',\n",
       "  'url': 'https://linkinghub.elsevier.com/retrieve/pii/S0306261923014356',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'Photovoltaic Cell Defect Detection by Lock-In Thermography Using 2-D Gaussian Profile',\n",
       "  'creators': 'VieiraThiago Mota',\n",
       "  'abstractNote': 'The electrical energy produced by photovoltaic systems can be critically affected by a variety of factors. In order to detect defective photovoltaic cells， several monitoring techniques， such as lock-in thermography， have been widely used alongside some analytical methods that avoid subjectivity. This article proposes a method with low computational cost that provides a simple and easily implementable way to quantifiably discern if a photovoltaic cell is defective or not. A two-dimensional Gaussian fit is applied to images generated by fast Fourier transform and principal component analysis algorithms on thermographic data from lock-in thermography tests. The considered coefficient of determination R^2 was found to be a good measure of fitting quality. Additionally， the method highlighted the potential of its application on first principal component， with R^2 between 0.944 and 0.986， and magnitude images， with R^2 between 0.965 and 0.985， in order to identify and distinguish nondefective cells from defective ones.',\n",
       "  'publicationTitle': 'IEEE Journal of Photovoltaics',\n",
       "  'date': '2024-05',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/10438520',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Coefficient of determination',\n",
       "   'Fast Fourier transforms',\n",
       "   'Gaussian processes',\n",
       "   'Photovoltaic cells',\n",
       "   'Photovoltaic systems',\n",
       "   'Principal component analysis',\n",
       "   'Surface cracks',\n",
       "   'lock-in thermography (LIT)',\n",
       "   'photovoltaic (PV) cell',\n",
       "   'two-dimensional (2-D) Gaussian fit']},\n",
       " {'title': 'Faults Detection for Photovoltaic Field Based on K-Means, Elbow, and Average Silhouette Techniques through the Segmentation of a Thermal Image',\n",
       "  'creators': 'Et-talebyAbdelilah',\n",
       "  'abstractNote': 'Clustering or grouping is among the most important image processing methods that aim to split an image into different groups. Examining the literature， many clustering algorithms have been carried ou...',\n",
       "  'publicationTitle': 'International Journal of Photoenergy',\n",
       "  'date': '2020/01/01',\n",
       "  'language': 'en',\n",
       "  'url': 'https://onlinelibrary.wiley.com/doi/10.1155/2020/6617597',\n",
       "  'libraryCatalog': 'onlinelibrary.wiley.com',\n",
       "  'tags': []},\n",
       " {'title': 'Thermal Infrared and Visual Inspection of Photovoltaic Installations by UAV Photogrammetry—Application Case: Morocco',\n",
       "  'creators': 'ZefriYahya',\n",
       "  'abstractNote': 'Being sustainable， clean， and eco-friendly， photovoltaic technology is considered as one of the most hoped solutions face to worldwide energetic challenges. Morocco joins this context with the inauguration of numerous clean energy projects. However， one key factor in making photovoltaic installations a profitable investment are regular and effective inspections in order to detect occurred defects. Unmanned aerial vehicles (UAV) are increasingly used in various inspection fields. In this respect， this work focuses on the use of thermal and visual imagery taken by UAV in the inspection of photovoltaic installations. Visual and thermal images of photovoltaic modules， obtained by UAV， from different installations， and with different acquisition conditions and parameters， were exploited to generate orthomosaics for inspection purposes. The methodology was tested on a dataset we have acquired by a mission in Rabat (Morocco)， and also on external datasets acquired in Switzerland. As final results， several visual defects were detected in visual RGB and thermal orthomosaics， such as cracks， soiling， and hotspots. In addition， a procedure of semi-automatic hotspots’ extraction was also developed and is presented within this work. On the other side， various tests were conducted on the influence of some acquisition and processing parameters (images’ overlap， the ground sampling distance， the flying height， the use of ground control points， the internal camera parameters’ optimization) on the detection of defects and the quality of visual and thermal generated orthomosaics. In the end， the potential of UAV thermal and visual imagery in the inspection of photovoltaic installations was discussed in function of various parameters. On the basis of the discussion feedback， UAV were concluded as advantageous tools within the thematic of this project， which proves the necessity of their implementation in this context.',\n",
       "  'publicationTitle': 'Drones',\n",
       "  'date': '2018/12',\n",
       "  'language': 'en',\n",
       "  'url': 'https://www.mdpi.com/2504-446X/2/4/41',\n",
       "  'libraryCatalog': 'www.mdpi.com',\n",
       "  'tags': ['UAV',\n",
       "   'automatic detection',\n",
       "   'defects',\n",
       "   'photovoltaic installation',\n",
       "   'thermal infrared inspection',\n",
       "   'visual inspection']},\n",
       " {'title': 'Automatic supervision and fault detection of PV systems based on power losses analysis',\n",
       "  'creators': 'ChouderA.',\n",
       "  'abstractNote': 'In this work， we present a new automatic supervision and fault detection procedure for PV systems， based on the power losses analysis. This automatic supervision system has been developed in Matlab&Simulink environment. It includes parameter extraction techniques to calculate main PV system parameters from monitoring data in real conditions of work， taking into account the environmental irradiance and module temperature evolution， allowing simulation of the PV system behaviour in real time. The automatic supervision method analyses the output power losses， presents in the DC side of the PV generator， capture losses. Two new power losses indicators are deﬁned: thermal capture losses (Lct) and miscellaneous capture losses (Lcm). The processing of these indicators allows the supervision system to generate a faulty signal as indicator of fault detection in the PV system operation.',\n",
       "  'publicationTitle': 'Energy Conversion and Management',\n",
       "  'date': '10/2010',\n",
       "  'language': 'en',\n",
       "  'url': 'https://linkinghub.elsevier.com/retrieve/pii/S0196890410000919',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'A comprehensive study on different types of faults and detection techniques for solar photovoltaic system',\n",
       "  'creators': 'MadetiSiva Ramakrishna',\n",
       "  'abstractNote': 'Monitoring systems are essential to maintain optimal performance of photovoltaic (PV) systems. A critical aspect in such monitoring systems is the fault diagnosis technique being used. The role of a fault detection and diagnosis technique is to identify the causes affecting the real-time energy production and/or smooth functioning of PV systems. Over the past decade， various fault detection methods were reported in literature. Among all the fault detection techniques reported， some paid significant attention only on faults that occur in the PV system， some on faults on DC side of the PV system while the rest focused on AC side faults. For the first time， this paper provides a comprehensive review of popular fault detection techniques， addressing all major types of faults in PV systems. Detailed insights of PV fault detection techniques along with their relative performances are covered. A new fault detection technique is also proposed to identify the type and location (module level) of a fault. This review enables the reader to get acquaintance with major aspects/considerations in developing/choosing an effective yet viable fault detection technique for small and medium scale PV systems.',\n",
       "  'publicationTitle': 'Solar Energy',\n",
       "  'date': '2017-12-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0038092X17307508',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Different faults',\n",
       "   'Fault detection techniques',\n",
       "   'Photovoltaic systems']},\n",
       " {'title': 'Identifying PV Module Mismatch Faults by a Thermography-Based Temperature Distribution Analysis',\n",
       "  'creators': 'HuYihua',\n",
       "  'abstractNote': 'Photovoltaic (PV) solar power generation is proven to be effective and sustainable but is currently hampered by relatively high costs and low conversion efficiency. This paper addresses both issues by presenting a low-cost and efficient temperature distribution analysis for identifying PV module mismatch faults by thermography. Mismatch faults reduce the power output and cause potential damage to PV cells. This paper first defines three fault categories in terms of fault levels， which lead to different terminal characteristics of the PV modules. The investigation of three faults is also conducted analytically and experimentally， and maintenance suggestions are also provided for different fault types. The proposed methodology is developed to combine the electrical and thermal characteristics of PV cells subjected to different fault mechanisms through simulation and experimental tests. Furthermore， the fault diagnosis method can be incorporated into the maximum power point tracking schemes to shift the operating point of the PV string. The developed technology has improved over the existing ones in locating the faulty cell by a thermal camera， providing a remedial measure， and maximizing the power output under faulty conditions.',\n",
       "  'publicationTitle': 'IEEE Transactions on Device and Materials Reliability',\n",
       "  'date': '2014-12',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/6879295',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Arrays',\n",
       "   'Cameras',\n",
       "   'Circuit faults',\n",
       "   'Degradation',\n",
       "   'Fault diagnosis',\n",
       "   'Lighting',\n",
       "   'Temperature distribution',\n",
       "   'Temperature measurement',\n",
       "   'fault diagnosis',\n",
       "   'photovoltaic (PV) power systems',\n",
       "   'temperature',\n",
       "   'thermography']},\n",
       " {'title': 'Deep residual network based fault detection and diagnosis of photovoltaic arrays using current-voltage curves and ambient conditions基于深度残差网络的故障检测，以及使用电流-电压曲线和环境条件对光伏阵列进行诊断',\n",
       "  'creators': 'ChenZhicong',\n",
       "  'abstractNote': 'Automatic fault detection and diagnosis techniques for photovoltaic arrays are crucial to promote the efficiency， reliability and safety of photovoltaic systems. In recent decades， many conventional artificial intelligence approaches have been successfully applied to automatically establish fault detection and diagnosis model using fault data samples， but most of them rely on manual feature extraction or expert knowledge to build diagnosis models， which is inefficient and may ignore some potential useful features. In addition， they usually use shallow neural networks with limited performance. Addressing the issues， this paper proposes a novel intelligent fault detection and diagnosis method for photovoltaic arrays based on a newly designed deep residual network model trained by the adaptive moment estimation deep learning algorithm， which can automatically extract features from raw current-voltage curves and ambient irradiance and temperature， and effectively improve the performance with a deeper network. In order to validate the proposed fault diagnosis model， a Simulink based simulation model is designed for a real laboratory photovoltaic array， and both fault simulation and real experiments are carried out to obtain simulation and experimental fault datasets. Furthermore， two other popular deep learning based models are used for comparison， including convolution neural network and convolutional auto-encoder. Both of simulation and real experimental comparison results demonstrate that the proposed deep residual network based method achieves high and best overall performance in terms of accuracy， generalization performance， reliability and training efficiency.',\n",
       "  'publicationTitle': 'Energy Conversion and Management',\n",
       "  'date': '2019-10-15',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0196890419307757',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Current-voltage characteristic curves',\n",
       "   'Deep learning',\n",
       "   'Deep residual networks',\n",
       "   'Fault detection and diagnosis',\n",
       "   'Photovoltaic arrays']},\n",
       " {'title': 'Reliable fault detection and diagnosis of photovoltaic systems based on statistical monitoring approaches',\n",
       "  'creators': 'HarrouFouzi',\n",
       "  'abstractNote': '',\n",
       "  'publicationTitle': 'Renewable Energy',\n",
       "  'date': '02/2018',\n",
       "  'language': 'en',\n",
       "  'url': 'https://linkinghub.elsevier.com/retrieve/pii/S0960148117309114',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'FBRT-YOLO: Faster and Better for Real-Time Aerial Image Detection',\n",
       "  'creators': 'XiaoYao',\n",
       "  'abstractNote': 'Embedded flight devices with visual capabilities have become essential for a wide range of applications. \\nIn aerial image detection， while many existing methods have partially addressed the issue of small target detection， challenges remain in optimizing small target detection and balancing detection accuracy with efficiency.\\nThese issues are key obstacles to the advancement of real-time aerial image detection.\\nIn this paper， we propose a new family of real-time detectors for aerial image detection， named FBRT-YOLO， to address the imbalance between detection accuracy and efficiency. Our method comprises two lightweight modules: Feature Complementary Mapping Module (FCM) and Multi-Kernel Perception Unit (MKP)， designed to enhance object perception for small targets in aerial images.\\nFCM focuses on alleviating the problem of information imbalance caused by the loss of small target information in deep networks. It aims to integrate spatial positional information of targets more deeply into the network， better aligning with semantic information in the deeper layers to improve the localization of small targets.\\nWe introduce MKP， which leverages convolutions with kernels of different sizes to enhance the relationships between targets of various scales and improve the perception of targets at different scales.\\nExtensive experimental results on three major aerial image datasets， including Visdrone， UAVDT， and AI-TOD， demonstrate that FBRT-YOLO outperforms various real-time detectors in terms of performance and speed.',\n",
       "  'publicationTitle': 'Proceedings of the AAAI Conference on Artificial Intelligence',\n",
       "  'date': '2025-04-11',\n",
       "  'language': 'en',\n",
       "  'url': 'https://ojs.aaai.org/index.php/AAAI/article/view/32937',\n",
       "  'libraryCatalog': 'ojs.aaai.org',\n",
       "  'tags': []},\n",
       " {'title': 'Fast fault detection method for photovoltaic arrays with adaptive deep multiscale feature enhancement',\n",
       "  'creators': 'GongBin',\n",
       "  'abstractNote': 'Photovoltaic (PV) arrays have output characteristics such as randomness and intermittency， and faults can seriously affect the safe operation of the power system. In order to improve the comprehensive performance of the PV array fault diagnosis model， a new intelligent online fault monitoring method for PV arrays is proposed in this paper. (1) a three-dimensional channel feature map based on I， V， and P features is constructed because the I-V and P curves of the PV array have significantly different effects under different fault conditions. (2) The PV array fault diagnosis model based on a multi-source information fusion network (MIFNet) is proposed， and Channel Mixing Convolution (CMC) module， three-dimensional feature attention enhancement (TDFAE) module， and Channel normalized scaling (CNS) module are designed to improve the comprehensive performance of the model. (3) An adaptive nonlinear mutual sparrow search algorithm (ANMSSA) is proposed to optimize the hyperparameter configuration of the MIFNet network. The experimental results show that the average recognition accuracy， prediction accuracy， and sensitivity of the ANMSSA-MIFNet network proposed in this paper are 99.64%， 99.64%， and 99.71% respectively. When facing single-component faults and multi-component faults， the model has stronger diagnostic accuracy， robustness， anti-noise ability， and stability， and can efficiently diagnose different faults of PV arrays， providing the scientific basis and theoretical support for the operation of PV systems.',\n",
       "  'publicationTitle': 'Applied Energy',\n",
       "  'date': '2024-01-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0306261923014356',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Fault diagnosis',\n",
       "   'Improved sparrow optimization algorithm',\n",
       "   'Multi-scale feature fusion',\n",
       "   'Photovoltaic arrays',\n",
       "   'Three-dimensional feature attention enhancement module']},\n",
       " {'title': 'An unsupervised hourly weather status pattern recognition and blending fitting model for PV system fault detection',\n",
       "  'creators': 'QuJiaqi',\n",
       "  'abstractNote': 'Detecting PV system faults in a timely fashion is important to ensure the safe operation of equipment and reduce their impact on the economy of the PV systems. It is necessary to further improve the time-sensitive performance evaluation of the system. However， the hourly weather scenario segmentations are seldom considered during the hour-level online monitoring process. Therefore， a hybrid method based on unsupervised hourly weather status pattern recognition and blending fitting model is proposed for hourly fault detection to improve the performance evaluation of PV systems. The proposed solution includes three parts， firstly， in the data preprocessing stage， the measured power with the errors and noise under normal operation situation caused by the environment changes is corrected by monthly linear fitting. Secondly， an unsupervised hourly weather status pattern recognition method is constructed using the measured radiation data， including unsupervised clustering and the Multiclass-GBDT-LR classification process. Finally， after eliminating the anomalies and errors， the blending fitting model of the hourly sub-weather status is established. Through the analysis of power plants in Australia and China， the proposed solutions are validated and evaluated to be superior to existing data-driven solutions in terms of fitting accuracy， detection validity， and response time. Numerical results of case studies indicate that the developed methodology under sub-weather has improved the detection accuracy up to 97.71% and 99.29% compared to benchmark models.',\n",
       "  'publicationTitle': 'Applied Energy',\n",
       "  'date': '2022-08-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0306261922006286',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Blending fitting model',\n",
       "   'Hourly fault detection',\n",
       "   'Photovoltaic systems performance',\n",
       "   'Unsupervised hourly weather status pattern recognition']},\n",
       " {'title': 'Photovoltaic Bypass Diode Fault Detection Using Artificial Neural Networks',\n",
       "  'creators': 'DhimishMahmoud',\n",
       "  'abstractNote': 'Due to the importance of determining faulty bypass diodes in photovoltaic (PV) systems， faulty bypass diodes have been of widespread interest in recent years due to their importance in improving PV system durability， operation， and overall safety. This article presents new work in developing an artificial intelligence (AI) based model using the principles of artificial neural networks (ANNs) to detect short and open PV bypass diode fault conditions. With only three inputs from the PV system， namely， the output power， short-circuit current， and open-circuit voltage， the developed ANN model can determine whether the PV bypass diodes are defective. In the experimentally validated case of short and open bypass diodes， 93.6% and 93.3% of faulty bypass diodes can be detected. Furthermore， the developed ANN model has an average precision and sensitivity of 96.4% and 92.6%， respectively.',\n",
       "  'publicationTitle': 'IEEE Transactions on Instrumentation and Measurement',\n",
       "  'date': '2023',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/10042455',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Artificial intelligence (AI)',\n",
       "   'Artificial neural networks',\n",
       "   'Circuit faults',\n",
       "   'Fault diagnosis',\n",
       "   'Neurons',\n",
       "   'Photovoltaic cells',\n",
       "   'Photovoltaic systems',\n",
       "   'Temperature measurement',\n",
       "   'bypass diodes',\n",
       "   'fault detection algorithm',\n",
       "   'photovoltaics (PVs)']},\n",
       " {'title': 'Faults detection and diagnosis of PV systems based on machine learning approach using random forest classifier',\n",
       "  'creators': 'AmiriAhmed Faris',\n",
       "  'abstractNote': 'Accurate and reliable fault detection procedures are crucial for optimizing photovoltaic (PV) system performance. Establishing a trustworthy PV array model is the primary step and a vital tool for monitoring and diagnosing PV systems. This paper outlines a two-step approach for creating a reliable PV array model and implementing a fault detection procedure using Random Forest Classifiers (RFCs). Firstly， we extracted the five unknown parameters of the one-diode model (ODM) by combining the current–voltage translation method to predict the reference curve and employing the modified grey wolf optimization (MGWO) algorithm. In the second step， we simulated the PV array to obtain maximum power point (MPP) coordinates and construct operational databases through co-simulations in PSIM/MATLAB. We developed two RFCs: one for fault detection (a binary classifier) and another for fault diagnosis (a multiclass classifier). Our results confirmed the accuracy of the PV array modeling approach. We achieved a root mean square error (RMSE) value of 0.0122 for the ODM parameter extraction and RMSEs lower than 0.3 in dynamic PV array output current simulations under cloudy conditions. Regarding the fault detection procedure， our results demonstrate exceptional classification accuracy rates of 99.4% for both fault detection and diagnosis， surpassing other tested models like Support Vector Machines (SVM)， K-Nearest Neighbors (KNN)， Neural Networks (MLP Classifier)， Decision Trees (DT)， and Stochastic Gradient Descent (SGDC).',\n",
       "  'publicationTitle': 'Energy Conversion and Management',\n",
       "  'date': '2024-02-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0196890424000177',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Co-simulation',\n",
       "   'Dynamic MPP model',\n",
       "   'Fault detection',\n",
       "   'MGWO algorithm',\n",
       "   'Parameter extraction',\n",
       "   'Random forest classifier']},\n",
       " {'title': 'Failures of Photovoltaic modules and their Detection: A Review',\n",
       "  'creators': 'Waqar AkramM.',\n",
       "  'abstractNote': 'Photovoltaic (PV) has emerged as a promising and phenomenal renewable energy technology in the recent past and the PV market has developed at an exponential rate during the time. However， a large number of early failure and degradation cases are also observed in the field. Besides these， there are fire risks associated with PV modules installed in the field， roof-mounted and building integrated PV systems， as modules contain combustible materials. The fire is caused by different failures and faults such as electrical arcs， short circuits， and hotspots. The timely， fast and accurate detection and measurement of failures is important to produce efficient and durable modules. Conventional visual monitoring and assessment process is commonly used in the field， which is mainly dependent upon human abilities and often involve human error. Moreover， it is only practicable on small-scale and requires long time. With the rising use of PV solar energy and ongoing installation of large-scale PV power plants worldwide， the automation of PV monitoring and assessment methods becomes important. Here， the present paper focuses on module failures， fire risks associated with PV modules， failure detection/measurements， and computer/machine vision or artificial intelligence (AI) based failure detection in PV modules; and can serve as a one-stop source for PV system inspectors. All types of failures occurred in PV modules including recent reported field failures are discussed in the paper. The fire risks associated with PV modules and reduction of fire risks and hotspots is also discussed. Different failure detection methods and recent advancements in these methods are presented. The strengths and limitations of each method is summarized. Moreover， the studies conducted on combined application and comparison of different methods are extensively reviewed. The boundary conditions of applications of different failure detection methods are provided which helps in selection of appropriate method. Subsequent to this， automatic techniques are introduced and their implementation and applications are discussed. The strengths and limitations of different automatic techniques and their applicability with respect to different conditions is discussed. This study may act as a one-stop guide for: acquiring information about module structure and failures， mitigation of fire risks and hotspots， selection of appropriate characterization method， application of different methods， automation of detection tasks， and remote PV plant inspection. The PV sector is at the start of AI journey and has a long path to go. The present paper is a significant step in the AI journey. The existing knowledge is organized systematically in a handy manner， thereby can facilitates new developments in AI-related research， fire risks mitigation， and failure detection.',\n",
       "  'publicationTitle': 'Applied Energy',\n",
       "  'date': '2022-05-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0306261922002677',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Artificial intelligence and deep learning',\n",
       "   'Defect detection',\n",
       "   'Electroluminescence and Infrared imaging',\n",
       "   'Module failures and fire risks',\n",
       "   'Photovoltaic cells']},\n",
       " {'title': 'An innovative transformer neural network for fault detection and classification for photovoltaic modules',\n",
       "  'creators': 'RamadanE. A.',\n",
       "  'abstractNote': 'Solar energy from photovoltaic systems (PV) ranks as the third greatest renewable electricity generation resource， expanding quickly through the years as it is free from environmental pollution and has cheap installation costs. Effective performance at high working rates is contingent on the early failure detection of PV modules. This study introduces an innovative deep learning model utilizing a Vision Transformer (ViT) artificial neural network (ANN) for the automatic detection of faults in infrared thermography (IR) images of PV modules. Our approach aims to enhance the accuracy of PV fault detection and classification compared to existing deep learning methods. The proposed framework encompasses three primary stages: (1) image preprocessing， which includes the application of the unsharp mask to sharpen the image’s edges or high-frequency components; (2) data augmentation techniques designed to overcome the problem of unbalanced classes that affect the training process， resulting in learning specific majority classes better than others; and (3) implementing a Vision Transformer deep learning model for its precision in digital image analysis. We evaluated the framework using the public Infrared Solar Modules dataset. The performance was quantitatively assessed using several metrics: accuracy， recall， precision， and F1 score. The dataset is classified into eleven different PV anomalies and another class of no-anomaly PV modules. The results show that our proposed approach has 98.23% accuracy for classifying the dataset into two classes， one for the PV anomaly and the other for the no-anomaly. It also has 96.19% accuracy for classifying eleven PV failures and 95.55% for twelve classes， including the no-anomaly class with the eleven types of anomalies. The experimental results underscore the potential of our model for earlier and more precise detection of PV faults. Furthermore， comparative analysis revealed the superior performance of the proposed approach over other deep learning methods.',\n",
       "  'publicationTitle': 'Energy Conversion and Management',\n",
       "  'date': '2024-08-15',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0196890424006599',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Artificial Intelligence',\n",
       "   'Fault Detection System',\n",
       "   'Photovoltaic (PV) systems',\n",
       "   'Thermography',\n",
       "   'Vision Transformer']},\n",
       " {'title': 'Fault detection and monitoring systems for photovoltaic installations: A review',\n",
       "  'creators': 'Triki-LahianiAsma',\n",
       "  'abstractNote': \"As any energy production system， photovoltaic (PV) installations have to be monitored to enhance system performances and to early detect failures for more reliability. There are several photovoltaic monitoring strategies based on the output of the plant and its nature. Monitoring can be performed locally on site or remotely. It measures production， focuses also on verification and follow-up of converter and communication devices' effective operation. Up to now， some faults diagnosis methods for PV components and systems have been developed. However， given the evolution of PV installations， more advanced monitoring techniques are continuously under investigation. In this paper， major photovoltaic system failures are addressed. Then techniques for photovoltaic monitoring proposed in recent literature are overviewed and analyzed to point out their differences， advantages and limits.\",\n",
       "  'publicationTitle': 'Renewable and Sustainable Energy Reviews',\n",
       "  'date': '2018-02-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S1364032117313618',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Diagnosis', 'Fault detection', 'PV monitoring', 'PV systems']},\n",
       " {'title': 'Application of Artificial Neural Networks to photovoltaic fault detection and diagnosis: A review',\n",
       "  'creators': 'LiB.',\n",
       "  'abstractNote': 'The rapid development of photovoltaic (PV) technology and the growing number and size of PV power plants require increasingly efficient and intelligent health monitoring strategies to ensure reliable operation and high energy availability. Among the various techniques， Artificial Neural Network (ANN) has exhibited the functional capacity to perform the identification and classification of PV faults. In the present review， a systematic study on the application of ANN and hybridized ANN models for PV fault detection and diagnosis (FDD) is conducted. For each application， the targeted PV faults， the detectable faults， the type and amount of data used， the model configuration and the FDD performance are extracted， and analyzed. The main trends， challenges and prospects for the application of ANN for PV FDD are extracted and presented.',\n",
       "  'publicationTitle': 'Renewable and Sustainable Energy Reviews',\n",
       "  'date': '2021-03-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S136403212030798X',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Artificial neural network',\n",
       "   'Deep learning',\n",
       "   'Fault classification',\n",
       "   'Fault detection',\n",
       "   'Machine learning',\n",
       "   'Photovoltaic']},\n",
       " {'title': 'MambaSOD: Dual Mamba-driven cross-modal fusion network for RGB-D Salient Object Detection',\n",
       "  'creators': 'ZhanYue',\n",
       "  'abstractNote': 'The purpose of RGB-D Salient Object Detection (SOD) is to pinpoint the most visually conspicuous areas within images accurately. Numerous conventional models heavily rely on CNN and overlook the long-range contextual dependencies， subsequent transformer-based models have addressed the issue to some extent but introduce quadratic computational complexity. Moreover， incorporating spatial information from depth maps has been proven effective for this task and the primary challenge is how to effectively fuse the complementary information from RGB and depth. Recent advancements in Mamba， particularly its superior ability to perform long-range modeling within linear efficiency， have motivated our exploration of its potential in the RGB-D SOD task. In this paper， we propose a dual Mamba-driven cross-modal fusion network for RGB-D SOD， named MambaSOD， which effectively leverages Mamba’s long-range dependency modeling capability. Specifically， we employ a dual Mamba-driven feature extractor to process RGB and depth inputs to obtain features with global contextual information. Then， we design a cross-modal fusion Mamba to perform modality-specific feature enhancement and model the inter-modal correlation between the RGB and depth features. To the best of our knowledge， this work is an innovative attempt to explore the potential of the pure Mamba in the RGB-D SOD task， offering a novel perspective. Numerous experiments conducted on seven prevailing datasets demonstrate our method’s superiority over eighteen state-of-the-art RGB-D SOD models. The source code will be released at https://github.com/YueZhan721/MambaSOD.',\n",
       "  'publicationTitle': 'Neurocomputing',\n",
       "  'date': '2025-05-28',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S092523122500390X',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Cross-modal Fusion Mamba',\n",
       "   'Mamba-based backbone',\n",
       "   'RGB-D salient object detection',\n",
       "   'State Space Model']},\n",
       " {'title': 'FM-RTDETR: Small Object Detection Algorithm Based on Enhanced Feature Fusion with Mamba',\n",
       "  'creators': 'YangYuchuan',\n",
       "  'abstractNote': \"Traditional real-time object detection networks deployed in unmanned aerial vehicles (UAVs) struggle to extract features from small objects in complex backgrounds with occlusions and overlapping objects. To address this challenge， we propose FM-RTDETR， a real-time object detection algorithm optimized for small object detection. We redesign the encoder of RT-DETRv2 by integrating the Feature Aggregation and Diffusion Network (FADN)， improving the algorithm's ability to capture contextual information. Subsequently， we introduce the Parallel Atrous Mamba Feature Fusion Module (PAMFFM)， which combines shallow and deep semantic information to better capture small object features. Furthermore， we propose the Cross-stage Enhanced Feature Fusion Module (CEFFM)， merging features for small objects to provide richer and more detailed information. Finally， we propose STIoU Loss， which incorporates a penalty term to adjust the scaling of the loss function， thereby improving detection granularity for small objects. FM-RTDETR achieves AP<sub>50</sub> scores of 54.0% and 56.3% on the VisDrone2019-DET and AI-TOD datasets. Compared with other state-of-the-art methods， our method shows great potential in small object detection from drones. The code is available at https://github.com/Yyc1999super/FM-RTDETR.\",\n",
       "  'publicationTitle': 'IEEE Signal Processing Letters',\n",
       "  'date': '2025',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/10935299/',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Accuracy',\n",
       "   'Artificial intelligence',\n",
       "   'Convolution',\n",
       "   'Decoding',\n",
       "   'Feature extraction',\n",
       "   'Feature fusion',\n",
       "   'Object detection',\n",
       "   'RT-DETRv2',\n",
       "   'Real-time systems',\n",
       "   'Signal processing algorithms',\n",
       "   'Small object detection',\n",
       "   'Training',\n",
       "   'Transformers',\n",
       "   'VisDrone']},\n",
       " {'title': 'MetaFormer Baselines for Vision',\n",
       "  'creators': 'YuWeihao',\n",
       "  'abstractNote': \"MetaFormer， the abstracted architecture of Transformer， has been found to play a significant role in achieving competitive performance. In this paper， we further explore the capacity of MetaFormer， again， without focusing on token mixer design: we introduce several baseline models under MetaFormer using the most basic or common mixers， and summarize our observations as follows: (1) MetaFormer ensures solid lower bound of performance. By merely adopting identity mapping as the token mixer， the MetaFormer model， termed IdentityFormer， achieves >80% accuracy on ImageNet-1K. (2) MetaFormer works well with arbitrary token mixers. When specifying the token mixer as even a random matrix to mix tokens， the resulting model RandFormer yields an accuracy of >81%， outperforming IdentityFormer. Rest assured of MetaFormer's results when new token mixers are adopted. (3) MetaFormer effortlessly offers state-of-the-art results. With just conventional token mixers dated back five years ago， the models instantiated from MetaFormer already beat state of the art. (a) ConvFormer outperforms ConvNeXt. Taking the common depthwise separable convolutions as the token mixer， the model termed ConvFormer， which can be regarded as pure CNNs， outperforms the strong CNN model ConvNeXt. (b) CAFormer sets new record on ImageNet-1K. By simply applying depthwise separable convolutions as token mixer in the bottom stages and vanilla self-attention in the top stages， the resulting model CAFormer sets a new record on ImageNet-1K: it achieves an accuracy of 85.5% at 224x224 resolution， under normal supervised training without external data or distillation. In our expedition to probe MetaFormer， we also find that a new activation， StarReLU， reduces 71% FLOPs of activation compared with GELU yet achieves better performance. We expect StarReLU to find great potential in MetaFormer-like models alongside other neural networks.\",\n",
       "  'publicationTitle': 'IEEE Transactions on Pattern Analysis and Machine Intelligence',\n",
       "  'date': '2/2024',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2210.13452',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Machine Learning']},\n",
       " {'title': 'Omni-Kernel Network for Image Restoration',\n",
       "  'creators': 'CuiYuning',\n",
       "  'abstractNote': 'Image restoration aims to reconstruct a high-quality image from a degraded low-quality observation. Recently， Transformer models have achieved promising performance on image restoration tasks due to their powerful ability to model long-range dependencies. However， the quadratically growing complexity with respect to the input size makes them inapplicable to practical applications. In this paper， we develop an efficient convolutional network for image restoration by enhancing multi-scale representation learning. To this end， we propose an omni-kernel module that consists of three branches， i.e.， global， large， and local branches， to learn global-to-local feature representations efficiently. Specifically， the global branch achieves a global perceptive field via the dual-domain channel attention and frequency-gated mechanism. Furthermore， to provide multi-grained receptive fields， the large branch is formulated via different shapes of depth-wise convolutions with unusually large kernel sizes. Moreover， we complement local information using a point-wise depth-wise convolution. Finally， the proposed network， dubbed OKNet， is established by inserting the omni-kernel module into the bottleneck position for efficiency. Extensive experiments demonstrate that our network achieves state-of-the-art performance on 11 benchmark datasets for three representative image restoration tasks， including image dehazing， image desnowing， and image defocus deblurring. The code is available at https://github.com/c-yn/OKNet.',\n",
       "  'publicationTitle': 'Proceedings of the AAAI Conference on Artificial Intelligence',\n",
       "  'date': '2024-03-24',\n",
       "  'language': 'en',\n",
       "  'url': 'https://ojs.aaai.org/index.php/AAAI/article/view/27907',\n",
       "  'libraryCatalog': 'ojs.aaai.org',\n",
       "  'tags': ['CV: Representation Learning for Vision']},\n",
       " {'title': 'Optimization and Validation of Wafer Surface Defect Detection Algorithm Based on RT-DETR',\n",
       "  'creators': 'XuAo',\n",
       "  'abstractNote': 'In response to the issue of poor detection performance on wafer surface defect spots and elongated scratches， an improved RT-DETR method for wafer surface defect detection is proposed. Firstly， a dynamic snake convolutional layer is introduced to detect elongated scratches where conventional convolutional kernels fail to extract features effectively. Secondly， to address the problem of information loss in small targets， an attention-based Transformer encoder module and a feature fusion network based on residual thinking are proposed. Finally， verification is conducted using a wafer test dataset. Experimental results demonstrate that compared to the original RT-DETR method， the model exhibits a 4.1% improvement in detecting small particles and a 5.4% improvement in scratch detection performance. Fully meeting the requirements of intelligent manufacturing and high detection accuracy.',\n",
       "  'publicationTitle': 'IEEE Access',\n",
       "  'date': '2025',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/10892113/?arnumber=10892113',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Accuracy',\n",
       "   'Computational modeling',\n",
       "   'Data mining',\n",
       "   'Defect detection',\n",
       "   'Defects detection',\n",
       "   'Feature extraction',\n",
       "   'Kernel',\n",
       "   'Semiconductor device modeling',\n",
       "   'Shape',\n",
       "   'Standards',\n",
       "   'Transformers',\n",
       "   'deep learning',\n",
       "   'object detection']},\n",
       " {'title': 'Foreign Object Shading Detection in Photovoltaic Modules Based on Transfer Learning',\n",
       "  'creators': 'LiuBin',\n",
       "  'abstractNote': 'As a representative new energy source， solar energy has the advantages of easy access to resources and low pollution. However， due to the uncertainty of the external environment， photovoltaic (PV) modules that collect solar energy are often covered by foreign objects in the environment such as leaves and bird droppings， resulting in a decrease in photoelectric conversion efficiency， power losses， and even the “hot spot” phenomenon， resulting in damage to the modules. Existing methods mostly inspect foreign objects manually， which not only incurs high labor costs but also hinders real-time monitoring. To address these problems， this paper proposes an IDETR deep learning target detection model based on Deformable DETR combined with transfer learning and a convolutional block attention module， which can identify foreign object shading on the surfaces of PV modules in actual operating environments. This study contributes to the optimal operation and maintenance of PV systems. In addition， this paper collects data in the field and constructs a dataset of foreign objects of PV modules. The results show that the advanced model can significantly improve the target detection AP values.',\n",
       "  'publicationTitle': 'Energies',\n",
       "  'date': '2023/1',\n",
       "  'language': 'en',\n",
       "  'url': 'https://www.mdpi.com/1996-1073/16/7/2996',\n",
       "  'libraryCatalog': 'www.mdpi.com',\n",
       "  'tags': ['convolutional block attention module',\n",
       "   'foreign object shading detection',\n",
       "   'photovoltaic module',\n",
       "   'transfer learning']},\n",
       " {'title': 'PD-DETR: towards efficient parallel hybrid matching with transformer for photovoltaic cell defects detection',\n",
       "  'creators': 'ZhaoLangyue',\n",
       "  'abstractNote': 'Defect detection for photovoltaic (PV) cell images is a challenging task due to the small size of the defect features and the complexity of the background characteristics. Modern detectors rely mostly on proxy learning objectives for prediction and on manual post-processing components. One-to-one set matching is a critical design for DEtection TRansformer (DETR) in order to provide end-to-end capability， so that does not need a hand-crafted Efficient Non-Maximum Suppression NMS. In order to detect PV cell defects faster and better， a technology called the PV cell Defects DEtection Transformer (PD-DETR) is proposed. To address the issue of slow convergence caused by DETR’s direct translation of image feature mapping into target detection results， we created a hybrid feature module. To achieve a balance between performance and computation， the image features are passed through a scoring network and dilated convolution， respectively， to obtain the foreground fine feature and contour high-frequency feature. The two features are then adaptively intercepted and fused. The capacity of the model to detect small-scale defects under complex background conditions is improved by the addition of high-frequency information. Furthermore， too few positive queries will be assigned to the defect target via one-to-one set matching， which will result in sparse supervision of the encoder and impair the decoder’s ability of attention learning. Consequently， we enhanced the detection effect by combining the original DETR with the one-to-many matching branch. Specifically， two Faster RCNN detection heads were added during training. To maintain the end-to-end benefits of DETR， inference is still performed using the original one-to-one set matching. Our model implements 64.7% AP on the PVEL-AD dataset.',\n",
       "  'publicationTitle': 'Complex & Intelligent Systems',\n",
       "  'date': '2024-12-01',\n",
       "  'language': 'en',\n",
       "  'url': 'https://doi.org/10.1007/s40747-024-01559-0',\n",
       "  'libraryCatalog': 'Springer Link',\n",
       "  'tags': ['DETR',\n",
       "   'Fine feature',\n",
       "   'High-frequency feature',\n",
       "   'One-to-many set matching',\n",
       "   'PV cell defects']},\n",
       " {'title': 'LW-PV DETR: lightweight model for photovoltaic panel surface defect detection',\n",
       "  'creators': 'HanTao',\n",
       "  'abstractNote': 'The photovoltaic industry is developing rapidly， and efficiently completing the operation and maintenance of photovoltaic systems has become a research hotspot， with photovoltaic panel defect detection being particularly critical. Due to factors such as the complex background of infrared images of photovoltaic panels taken by drones， the small proportion of defect areas， and equipment limitations， existing models face challenges in detection accuracy and deployment. Aiming at the three typical defects commonly found on the surface of photovoltaic (PV) panels， namely， shading， glass breakage and hot spots， a surface defect detection model (LW-PV DETR) for photovoltaic panels is proposed based on the Real-Time DEtection TRansformer (RT-DETR-R18) object detection model. In the backbone network， a lightweight and efficient attention feature extraction module (Faster-Rep-EMA Block) is designed to enhance the model feature extraction ability. In the Encoder， the lightweight convolution (GSConv) module is introduced to achieve model lightweighting. The feature focusing diffusion pyramid network (FFDPN) is proposed to enhance the model’s feature fusion capability. Simultaneously， to avoid the loss of small object features， a multi-level feature selective fusion (MLFSF) module is designed for feature focusing. For the loss function， Inner-IoU is introduced to improve the localization accuracy of bounding box regression. Experimental results on the public photovoltaic panel infrared image dataset GB_HSP_modified show that， compared to the baseline model， LW-PV DETR improves precision， recall， and mean Average Precision (mAP50， mAP50-95) by 3.9%， 18.6%， 18.5% and 10.9%， respectively， while the model’s parameter count is reduced by 11.83%. Compared to other mainstream object detection models， LW-PV DETR also demonstrates excellent detection performance， providing an important reference for research on intelligent detection of photovoltaic panel surface defects.',\n",
       "  'publicationTitle': 'Engineering Research Express',\n",
       "  'date': '2025-02',\n",
       "  'language': 'en',\n",
       "  'url': 'https://dx.doi.org/10.1088/2631-8695/adb4be',\n",
       "  'libraryCatalog': 'Institute of Physics',\n",
       "  'tags': []},\n",
       " {'title': 'A novel cost-function for transformerbased YOLO algorithm to detect photovoltaic panel defects',\n",
       "  'creators': 'TellaHambal',\n",
       "  'abstractNote': 'Solar panel defects can lead to substantial efficiency loss and increased maintenance expenses. Conventional defect detection methods are often slow and ineffective. Thisstudy revisits the You Only Look Once (YOLO) algorithm and its variations， assessing their efficacy in identifying defects in thermal images of solar panels. Subsequently， we introduce a novel YOLO algorithm， termed YOLOS-PV， built uponthe transformer-based YOLOS algorithm. The proposed algorithm introduces newloss function weights to prioritize localized objects and visualize the attention mapof each transformer head within the YOLOS algorithm. In the experiments， theYOLOS-PV achieves a mAP@0.5:0.95 score of 0.894， surpassing the efficiency ofother YOLO variants. Code implementation can be found here: tella26/YOLOS-PV (github.com).',\n",
       "  'publicationTitle': 'FME Transactions',\n",
       "  'date': '2024',\n",
       "  'language': 'en',\n",
       "  'url': 'https://scindeks.ceon.rs/Article.aspx?artid=1451-20922404639T',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'CCA-YOLO: Channel and Coordinate Aware-Based YOLO for Photovoltaic Cell Defect Detection in Electroluminescence Images',\n",
       "  'creators': 'BaoJunqi',\n",
       "  'abstractNote': 'Solar energy is a renewable energy used for urban power generation， contributing to sustainable cities. In solar energy generation， it is important to inspect the health of photovoltaic (PV) cells for safety and power transformation efficiency. Defects in PV cells are usually irregular with different scales， challenging automated defect detection for PV cells. Therefore， this article presents a channel and coordinate aware-based YOLO (CCA-YOLO) for efficient PV cell defect detection. Specifically， to provide accurate backbone features from the complex background defect images， the residual coordinate convolution-based ECA (RCC-ECA) enhances the backbone feature representation by learning from channel and coordinate information. To learn the intraclass/interclass variations and interclass similarity and convey coordinate information among different scales， the multiscale defect feature localization module (MDFLM) incorporates a larger backbone feature to improve the robustness of multiscale defects. The RCC-Up/Down optimizes the sampled features to minimize the inaccurate representation of the features caused by the sampling process. In addition， RCC-Up/Down conveys the coordinate information during the up/down sampling process to maintain coordinate awareness， which allows the network to learn from the coordinate information efficiently. Furthermore， the residual feature fusion with coordinate convolution-based CBAM (RFC-CBAM) is introduced to maintain the channel and coordinate awareness for efficient learning from fused features. The proposed CCA-YOLO outperforms state-of-the-art (SOTA) methods in PVEL-AD on precision (71.71%)， recall (76.91%)， F1-Scores (74.19%)， mAP50 (98.57%)， \\\\text AP_S (26.80%)， \\\\text AP_M (64.78%)， and \\\\text AP_L (74.93%).',\n",
       "  'publicationTitle': 'IEEE Transactions on Instrumentation and Measurement',\n",
       "  'date': '2025',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber=10884963&ref=',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Accuracy',\n",
       "   'Computer architecture',\n",
       "   'Convolutional neural networks',\n",
       "   'Defect detection',\n",
       "   'Feature extraction',\n",
       "   'Industries',\n",
       "   'Photovoltaic cells',\n",
       "   'Shape',\n",
       "   'Transformers',\n",
       "   'YOLO',\n",
       "   'defect detection',\n",
       "   'electroluminescence images',\n",
       "   'photovoltaic (PV) cell']},\n",
       " {'title': 'A novel object recognition method for photovoltaic (PV) panel occlusion based on deep learning',\n",
       "  'creators': 'YuJing',\n",
       "  'abstractNote': 'During the long-term operation of the photovoltaic (PV) system， occlusion will reduce the solar radiation energy received by the PV module， as well as the photoelectric conversion efficiency and economy. However， the occlusion detection of the PV power station has the defects of low efficiency， poor accuracy， and untimely detection， which will cause unknown system losses. Based on the deep learning algorithm， this paper conducts research on PV module occlusion detection. In order to accurately obtain the occlusion area and position information of the PV panel， a PV module occlusion detection model based on the Segment-You Only Look Once (Seg-YOLO) algorithm is established. Based on the YOLOv5 algorithm， the loss function is modified， the Segment Head detection module is introduced， and the convolutional block attention module (CBAM) attention mechanism is added to achieve the accurate detection of small targets by the algorithm model and the fast detection of the PV module occlusion area identify. The model performance research is carried out on three types of occlusion datasets: leaf， bird dropping， and shadow. According to the experimental results， the proposed model has better recognition accuracy and speed than SSD， Faster-Rcnn， YOLOv4， and U-Net. The precision rate， recall rate， and recognition speed can reach 90.52%， 92.41%， and 92.3 FPS， respectively. This model can lay a theoretical foundation for the intelligent operation and maintenance of PV systems.',\n",
       "  'publicationTitle': 'Journal of Computational Methods in Sciences and Engineering',\n",
       "  'date': '2023-11-01',\n",
       "  'language': 'en',\n",
       "  'url': 'https://journals.sagepub.com/doi/abs/10.3233/JCM-237108',\n",
       "  'libraryCatalog': 'SAGE Journals',\n",
       "  'tags': []},\n",
       " {'title': 'Solar panel defect detection design based on YOLO v5 algorithm',\n",
       "  'creators': 'HuangJing',\n",
       "  'abstractNote': '',\n",
       "  'publicationTitle': 'Heliyon',\n",
       "  'date': '2023-08-01',\n",
       "  'language': 'English',\n",
       "  'url': 'https://www.cell.com/heliyon/abstract/S2405-8440(23)06034-6',\n",
       "  'libraryCatalog': 'www.cell.com',\n",
       "  'tags': ['Defect detection',\n",
       "   'Electrical safety',\n",
       "   'Solar panels',\n",
       "   'YOLO v5']},\n",
       " {'title': 'ST-YOLO: A defect detection method for photovoltaic modules based on infrared thermal imaging and machine vision technology',\n",
       "  'creators': 'XieHanfei',\n",
       "  'abstractNote': 'Photovoltaic panels are the core components of photovoltaic power generation systems， and their quality directly affects power generation efficiency and circuit safety. To address the shortcomings of existing photovoltaic defect detection technologies， such as high labor costs， large workloads， high sensor failure rates， low reliability， high false alarm rates， high network demands， and slow detection speeds of traditional algorithms， we propose an algorithm named ST-YOLO specifically for photovoltaic module defect detection. This algorithm is based on YOLOv8s. First， it introduces the C2f-SCconv convolution module， which is based on SCconv convolution. This module reduces the computational burden of model parameters and improves detection speed through lightweight design. Additionally， the Triplet Attention mechanism is incorporated， significantly enhancing detection accuracy without substantially increasing model parameter computations. Experiments on a self-built photovoltaic array infrared defect image dataset show that ST-YOLO， compared to the baseline YOLOv8s， achieves a 15% reduction in model weight， a 2.9% improvement in Precision， and a 1.4% increase in mAP@0.5. Compared to YOLOv7-Tiny and YOLOv5s， ST-YOLO also demonstrates superior detection performance and advantages. This indicates that ST-YOLO has significant application value in photovoltaic defect detection.',\n",
       "  'publicationTitle': 'PLOS ONE',\n",
       "  'date': '2024年12月12日',\n",
       "  'language': 'en',\n",
       "  'url': 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0310742',\n",
       "  'libraryCatalog': 'PLoS Journals',\n",
       "  'tags': ['Algorithms',\n",
       "   'Alternative energy',\n",
       "   'Convolution',\n",
       "   'Imaging techniques',\n",
       "   'Machine learning algorithms',\n",
       "   'Photovoltaic power',\n",
       "   'Power stations',\n",
       "   'Sunlight']},\n",
       " {'title': 'PA-YOLO-Based Multifault Defect Detection Algorithm for PV Panels',\n",
       "  'creators': 'YinWang',\n",
       "  'abstractNote': 'In recent years， solar photovoltaic (PV) energy， as a clean energy source， has received widespread attention and experienced rapid growth worldwide. However， the rapid growth of PV power deployment also brings important challenges to the maintenance of PV panels， and in order to solve this problem， this paper proposes an innovative algorithm based on PA-YOLO. First， we propose to use PA-YOLO’s asymptotic feature pyramid network (AFPN) instead of YOLOv7’s backbone network to support direct interactions of nonadjacent layers and avoid large semantic gaps between nonadjacent layers. For the occlusion problem of dense targets in the dataset， we introduce a repulsive loss function， which successfully reduces the occurrence of false detection situations. Finally， we propose a customized convolutional block equipped with an EMA mechanism to enhance the perceptual and expressive capabilities of the model. Experimental results on the dataset show that our proposed model achieves excellent performance with an average accuracy (mAP) of 94.5%， which is 6.8% higher than YOLOv7. In addition， our algorithm also succeeds in drastically reducing the model size from 71.3 MB to 48.4 MB， which well demonstrates the effectiveness of the model.',\n",
       "  'publicationTitle': 'International Journal of Photoenergy',\n",
       "  'date': '2024',\n",
       "  'language': 'en',\n",
       "  'url': 'https://onlinelibrary.wiley.com/doi/abs/10.1155/2024/6113260',\n",
       "  'libraryCatalog': 'Wiley Online Library',\n",
       "  'tags': []},\n",
       " {'title': 'Enhanced photovoltaic panel defect detection via adaptive complementary fusion in YOLO-ACF',\n",
       "  'creators': 'PanWenwen',\n",
       "  'abstractNote': 'Detecting defects on photovoltaic panels using electroluminescence images can significantly enhance the production quality of these panels. Nonetheless， in the process of defect detection， there often arise instances of missed detections and false alarms due to the close resemblance between embedded defect features and the intricate background information. To tackle this challenge， we propose an Adaptive Complementary Fusion (ACF) module designed to intelligently integrate spatial and channel information. This module is seamlessly integrated into YOLOv5 for detecting defects on photovoltaic panels， aiming primarily to enhance model detection performance， achieve model lightweighting， and accelerate detection speed. In order to validate the efficacy of the proposed module， we conducted experiments using a dataset comprising 4500 electroluminescence images of photovoltaic panels. Compared to the cutting-edge detection capability of YOLOv8， our YOLO-ACF method exhibits enhancements of 5.2， 0.8， and 2.3 percentage points in R， mAP50， and mAP50-95， respectively. In contrast to the lightest and fastest YOLOv5， YOLO-ACF achieves reductions of 12.9%， 12.4%， and 4.2% in parameters， weight， and time， respectively， while simultaneously boosting FPS by 5%. Through qualitative and quantitative comparisons with various alternative methods， we demonstrate that our YOLO-ACF strikes a good balance between detection performance， model complexity， and detection speed for defect detection on photovoltaic panels. Moreover， it demonstrates remarkable versatility across a spectrum of defect types.',\n",
       "  'publicationTitle': 'Scientific Reports',\n",
       "  'date': '2024-11-02',\n",
       "  'language': 'en',\n",
       "  'url': 'https://www.nature.com/articles/s41598-024-75772-9',\n",
       "  'libraryCatalog': 'www.nature.com',\n",
       "  'tags': ['Computer science',\n",
       "   'Object vision',\n",
       "   'Photovoltaics',\n",
       "   'Solar energy and photovoltaic technology']},\n",
       " {'title': 'CEMP-YOLO: An infrared overheat detection model for photovoltaic panels in UAVs',\n",
       "  'creators': 'HongYan',\n",
       "  'abstractNote': 'Aiming at the complex working conditions of actual PV power stations， traditional PV panel detection methods employed by operators still result in some faults and safety risks. Under the framework of the YOLOv10n model， a CEMP-YOLOv10n-based infrared image detection algorithm for photovoltaic power plants is proposed. The improvements in CEMP-YOLOv10n comprise four main components. The ABCG_Block structure was designed， and the C2f structure within the Backbone component was optimized to enhance feature extraction capabilities. The ERepGFPN structure is used in the Neck component to retain semantic information and fuse features between high and low layers. The detector head was optimized with PConv convolution to minimize redundant information. Finally， MECA attention was added before P3， P4， and P5 detection heads to enhance adaptive recognition and accuracy.Experimental validation using infrared UAV imagery of PV panels shows the model’s computational cost decreased to 4.7 GFLOPs， 72.3 % of the original. Parameters and weights decreased by 25.99 % and 24.13 %， respectively， while accuracy and mean average precision (mAP) improved by 8.3% and 2 %， reaching 86.6 % and 87.3 %. Compared to 13 YOLO-series algorithms， including DETR， YOLOv8n， YOLOv9-tiny， and YOLOv11n， the CEMP-YOLOv10n model demonstrates superior accuracy， parameter efficiency， and memory consumption. The CEMP-YOLOv10n model significantly improves defect recognition accuracy， reduces missed detections， and balances lightweight design with detection speed. This lays the foundation for future UAV inspection edge device deployment and smart PV big data platform creation.',\n",
       "  'publicationTitle': 'Digital Signal Processing',\n",
       "  'date': '06/2025',\n",
       "  'language': 'en',\n",
       "  'url': 'https://linkinghub.elsevier.com/retrieve/pii/S1051200425000946',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'In-Depth Review of YOLOv1 to YOLOv10 Variants for Enhanced Photovoltaic Defect Detection',\n",
       "  'creators': 'HussainMuhammad',\n",
       "  'abstractNote': 'This review presents an investigation into the incremental advancements in the YOLO (You Only Look Once) architecture and its derivatives， with a specific focus on their pivotal contributions to improving quality inspection within the photovoltaic (PV) domain. YOLO’s single-stage approach to object detection has made it a preferred option due to its efficiency. The review unearths key drivers of success in each variant， from path aggregation networks to generalised efficient layer aggregation architectures and programmable gradient information， presented in the latest variant， YOLOv10， released in May 2024. Looking ahead， the review predicts a significant trend in future research， indicating a shift toward refining YOLO variants to tackle a wider array of PV fault scenarios. While current discussions mainly centre on micro-crack detection， there is an acknowledged opportunity for expansion. Researchers are expected to delve deeper into attention mechanisms within the YOLO architecture， recognising their potential to greatly enhance detection capabilities， particularly for subtle and intricate faults.',\n",
       "  'publicationTitle': 'Solar',\n",
       "  'date': '2024/9',\n",
       "  'language': 'en',\n",
       "  'url': 'https://www.mdpi.com/2673-9941/4/3/16',\n",
       "  'libraryCatalog': 'www.mdpi.com',\n",
       "  'tags': ['YOLO',\n",
       "   'computer vision',\n",
       "   'convolutional neural networks',\n",
       "   'deep learning',\n",
       "   'object detection',\n",
       "   'photovoltaic',\n",
       "   'quality inspection: manufacturing']},\n",
       " {'title': 'PV-YOLO: Lightweight YOLO for Photovoltaic Panel Fault Detection',\n",
       "  'creators': 'YinWang',\n",
       "  'abstractNote': 'The rapid development of the photovoltaic industry in recent years has made the efficient and accurate completion of photovoltaic operation and maintenance a major focus in recent studies. The key to photovoltaic operation and maintenance is the accurate multifault identification of photovoltaic panel images collected using drones. In this paper， PV-YOLO is proposed to replace YOLOX’s backbone network， CSPDarknet53， with a transformer-based PVTv2 network to obtain local connections between images and feature maps to extract more edge-detail features of similar faults. The CBAM attention mechanism is added to enhance the effective features and improve the detection accuracy of small objects. The label assignment mechanism is optimized， and the SIoU loss functionis used to improve the uneven distribution of samples and accelerate network convergence. Experiments on the dataset prove that this method is superior to the existing technology， as the highest mAP value is 92.56%. This value is 10.46% higher than that of YOLOX， and the mAP is optimal under the same parameter magnitude，proving the model’s effectiveness.Moreover， mAP is increased by over 10%， especially for small targets. In this paper， we implemented a lightweight design for the model， and proposes four models of different sizes to be-sized models that are suitable for different detection scenarios.',\n",
       "  'publicationTitle': 'IEEE Access',\n",
       "  'date': '2023',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/10032147',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Deep learning',\n",
       "   'Feature extraction',\n",
       "   'Mathematical models',\n",
       "   'Object detection',\n",
       "   'Photovoltaic panel failure',\n",
       "   'Photovoltaic systems',\n",
       "   'Transformers',\n",
       "   'YOLOX',\n",
       "   'lightweight',\n",
       "   'target detection',\n",
       "   'transformer']},\n",
       " {'title': 'Hybrid Deep Learning Model for Fault Detection and Classification of Grid-Connected Photovoltaic System',\n",
       "  'creators': 'AlrifaeyMoath',\n",
       "  'abstractNote': 'Effective fault detection and classification play essential roles in reducing the hazards such as electric shocks and fire in photovoltaic (PV) systems. However， the issues of interest in fault detection and classification for PV systems remain an open-ended challenge due to manual and time-consuming processes that require the relevant domain knowledge and experience of fault diagnoses. This paper proposes a hybrid deep-learning (DL) model-based combined architectures as the novel DL approaches to achieve the real-time automatic fault detection and classification of a PV system. This research employed the wavelet packet transform (WPT) as a data preprocessing technique to handle the PV voltage signal collected and feeding them as the inputs for combined DL architectures that consist of the equilibrium optimizer algorithm (EOA) and long short-term memory (LSTM-SAE) approaches. The combined DL architectures are able to extract the fault features automatically from the preprocessed data without requiring any previous knowledge， therefore can override the traditional shortages of manual feature extraction and manual selection of optimal features from the extracted fault features. These desirable features are anticipated to speed up the fault detection and classification capability of the proposed DL model with higher accuracy. In order to determine the performance of the proposed fault model， we carried out a comprehensive evaluation study on a 250-kW grid-connected PV system. In this paper， symmetrical and asymmetrical faults have been studied involving all the phases and ground faults such as single phase to ground， phases to phase， phase to phase to ground， and three-phase to ground. The simulation results validate the efficacy of the proposed model in terms of computation time， accuracy of fault detection， and noise robustness. Comprehensive comparisons between the simulation results and previous studies demonstrate the multidisciplinary applications of the present study.',\n",
       "  'publicationTitle': 'IEEE Access',\n",
       "  'date': '2022',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/9668848/?arnumber=9668848',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Deep distributed energy',\n",
       "   'Fault detection',\n",
       "   'Feature extraction',\n",
       "   'Maintenance engineering',\n",
       "   'Photovoltaic systems',\n",
       "   'Renewable energy sources',\n",
       "   'Support vector machines',\n",
       "   'Wavelet packets',\n",
       "   'equilibrium optimizer algorithm (EOA)',\n",
       "   'fault detection and classification',\n",
       "   'grid-connected photovoltaic systems',\n",
       "   'optimal feature selection',\n",
       "   'wavelet packet transform (WPT)']},\n",
       " {'title': 'EfficientDet: Scalable and Efficient Object Detection',\n",
       "  'creators': 'TanMingxing',\n",
       "  'abstractNote': 'Model efﬁciency has become increasingly important in computer vision. In this paper， we systematically study neural network architecture design choices for object detection and propose several key optimizations to improve efﬁciency. First， we propose a weighted bi-directional feature pyramid network (BiFPN)， which allows easy and fast multi-scale feature fusion; Second， we propose a compound scaling method that uniformly scales the resolution， depth， and width for all backbone， feature network， and box/class prediction networks at the same time. Based on these optimizations and EfﬁcientNet backbones， we have developed a new family of object detectors， called EfﬁcientDet， which consistently achieve much better efﬁciency than prior art across a wide spectrum of resource constraints. In particular， with single-model and single-scale， our EfﬁcientDetD7 achieves state-of-the-art 52.2 AP on COCO test-dev with 52M parameters and 325B FLOPs1， being 4x – 9x smaller and using 13x – 42x fewer FLOPs than previous detector. Code is available at https://github.com/google/ automl/tree/master/efficientdet.',\n",
       "  'publicationTitle': '',\n",
       "  'date': '',\n",
       "  'language': 'en',\n",
       "  'url': '',\n",
       "  'libraryCatalog': 'Zotero',\n",
       "  'tags': []},\n",
       " {'title': 'MetaFormer Is Actually What You Need for Vision',\n",
       "  'creators': 'YuWeihao',\n",
       "  'abstractNote': 'Transformers have shown great potential in computer vision tasks. A common belief is their attention-based token mixer module contributes most to their competence. However， recent works show the attention-based module in transformers can be replaced by spatial MLPs and the resulted models still perform quite well. Based on this observation， we hypothesize that the general architecture of the transformers， instead of the specific token mixer module， is more essential to the model’s performance. To verify this， we deliberately replace the attention module in transformers with an embarrassingly simple spatial pooling operator to conduct only basic token mixing. Surprisingly， we observe that the derived model， termed as PoolFormer， achieves competitive performance on multiple computer vi∗Work done during an internship at Sea AI Lab.',\n",
       "  'publicationTitle': '',\n",
       "  'date': '',\n",
       "  'language': 'en',\n",
       "  'url': '',\n",
       "  'libraryCatalog': 'Zotero',\n",
       "  'tags': []},\n",
       " {'title': 'Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks',\n",
       "  'creators': 'RenShaoqing',\n",
       "  'abstractNote': \"State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet [1] and Fast R-CNN [2] have reduced the running time of these detection networks， exposing region proposal computation as a bottleneck. In this work， we introduce a Region Proposal Network(RPN) that shares full-image convolutional features with the detection network， thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals， which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features-using the recently popular terminology of neural networks with 'attention' mechanisms， the RPN component tells the unified network where to look. For the very deep VGG-16 model [3]， our detection system has a frame rate of 5 fps (including all steps) on a GPU， while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007， 2012， and MS COCO datasets with only 300 proposals per image. In ILSVRC and COCO 2015 competitions， Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks. Code has been made publicly available.\",\n",
       "  'publicationTitle': 'IEEE Transactions on Pattern Analysis and Machine Intelligence',\n",
       "  'date': '2017-06',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/7485869',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Convolutional codes',\n",
       "   'Detectors',\n",
       "   'Feature extraction',\n",
       "   'Object detection',\n",
       "   'Proposals',\n",
       "   'Search problems',\n",
       "   'Training',\n",
       "   'convolutional neural network',\n",
       "   'region proposal']},\n",
       " {'title': 'An efficient CNN-based detector for photovoltaic module cells defect detection in electroluminescence images',\n",
       "  'creators': 'LiuQing',\n",
       "  'abstractNote': 'Electroluminescence (EL) imaging provides a high spatial resolution for inspecting photovoltaic (PV) cells， enabling the detection of various types of PV cell defects. Recently， convolutional neural network (CNN) based automatic detection methods for PV cell defects using EL images have attracted much attention. However， existing methods struggle to achieve a good balance between detection accuracy and efficiency. To address this issue， we propose a novel method for efficient PV cell defect detection. Firstly， we utilize Contrast Limited Adaptive Histogram Equalization (CLAHE) algorithm to improve EL image contrast， making defect features become more distinguishable. Secondly， we propose a lightweight defect detector using EfficientNet-B0 as its backbone. Moreover， we design a graph channel attention module (GCAM) to improve CNN’s limitation in modeling global information. It executes graph channel reasoning to generate enriched feature representation beyond the local receptive field， which is beneficial for distinguishing PV cell defects with similar local details. Next， we utilize focal loss to train the detector， enhancing its ability to detect challenging defects. Lastly， the proposed method is evaluated on the PVEL dataset and it achieved an accuracy of 97.81%， precision of 97.70%， recall of 97.59%， F1-score of 97.64%， and MCC of 97.32%， demonstrating our method is effective and outperforms state-of-the-art methods across various metrics.',\n",
       "  'publicationTitle': 'Solar Energy',\n",
       "  'date': '2024-01-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0038092X23008794',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['CLAHE',\n",
       "   'Electroluminescence imaging',\n",
       "   'Focal loss',\n",
       "   'Graph channel attention module',\n",
       "   'Lightweight defect detector',\n",
       "   'Photovoltaic cell defect detection']},\n",
       " {'title': 'Use of Drone and Infrared Camera for a Campus Building Envelope Study',\n",
       "  'creators': 'AriwoolaRaheem',\n",
       "  'abstractNote': '',\n",
       "  'publicationTitle': 'Electronic Theses and Dissertations',\n",
       "  'date': '2016-05-01',\n",
       "  'language': '',\n",
       "  'url': 'https://dc.etsu.edu/etd/3018',\n",
       "  'libraryCatalog': '',\n",
       "  'tags': []},\n",
       " {'title': 'UAV Remote Sensing Image Dehazing Based on Double-Scale Transmission Optimization Strategy',\n",
       "  'creators': 'ZhangKemeng',\n",
       "  'abstractNote': 'Current dehazing methods for unmanned aerial vehicle (UAV) remote sensing images often have texture detail loss and color distortion problems， especially in highlighted regions. This is mainly due to the rich texture and low intensity of UAV remote sensing images being ignored， which results in incorrect transmission estimation. In this letter， we propose a UAV remote sensing image dehazing method based on a double-scale transmission optimization strategy. First， we propose a double-scale optimization strategy to estimate the transmission map with more accurate texture details and color preservation， especially in highlighted regions of hazy UAV images that are most severely distorted. Second， a UAV-adaptive haze-line prior algorithm is proposed to address the large scene depth and low intensity of UAV remote sensing images. Finally， we introduce a luminance-weighted frequency-domain saliency model to avoid texture detail loss and color distortions for better transmission optimization， especially in highlighted regions. Compared with state-of-the-art methods， our method shows better detail performance and visual effects， especially for UAV images with highlighted regions.',\n",
       "  'publicationTitle': 'IEEE Geoscience and Remote Sensing Letters',\n",
       "  'date': '2022',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/9888129',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Autonomous aerial vehicles',\n",
       "   'Dehazing',\n",
       "   'Distortion',\n",
       "   'Feature extraction',\n",
       "   'Image color analysis',\n",
       "   'Optimization',\n",
       "   'Propagation losses',\n",
       "   'Remote sensing',\n",
       "   'remote sensing',\n",
       "   'saliency detection',\n",
       "   'transmission optimization',\n",
       "   'unmanned aerial vehicle (UAV)']},\n",
       " {'title': 'Safety Performance of Unsignalized Median U-Turn Intersections',\n",
       "  'creators': 'KayJonathan',\n",
       "  'abstractNote': 'Alternative intersection designs can offer safety and operational benefits with potentially lower costs than conventional intersections when implemented in the proper setting. The Federal Highway Administration has previously identified a subset of alternative designs called reduced left-turn conflict intersections as a proven safety countermeasure. Median U-turn intersections (also known as “Michigan lefts” or “boulevard turnarounds”) are one such design that accommodates all left-turn movements via directional U-turn crossovers within the median. Prior work has consistently shown that median U-turn intersections can provide superior safety performance when used in the appropriate conditions. However， research that is specific to unsignalized reduced left-turn conflict intersections continues to be limited to work conducted before the Highway Safety Manual， or which includes restricted crossing U-turn intersections. This study included the evaluation of historical traffic crashes and volume data at 95 unsignalized intersections in the state of Michigan. This included the collection of data for 39 median U-turn sites and 56 reference group sites to estimate safety performance functions and crash modification factors that can be used when considering future conversions. Ultimately， crash modification factors for fatal and injury crashes of 0.438 and 0.686 are recommended when converting intersections with undivided two-lane two-way major approaches and four-lane divided boulevard major approaches， respectively. Although there was no significant difference in property damage only crashes associated with converting intersections with undivided， two-lane， two-way major approaches， a crash modification factor of 1.325 is recommended for property damage only crashes specific to conversions with four-lane， divided boulevard major approaches.',\n",
       "  'publicationTitle': 'Transportation Research Record: Journal of the Transportation Research Board',\n",
       "  'date': '2022-04-29',\n",
       "  'language': '',\n",
       "  'url': '',\n",
       "  'libraryCatalog': 'ResearchGate',\n",
       "  'tags': []},\n",
       " {'title': 'Safety Performance of Unsignalized Median U-Turn Intersections',\n",
       "  'creators': 'KayJonathan',\n",
       "  'abstractNote': 'Alternative intersection designs can offer safety and operational benefits with potentially lower costs than conventional intersections when implemented in the proper setting. The Federal Highway Administration has previously identified a subset of alternative designs called reduced left-turn conflict intersections as a proven safety countermeasure. Median U-turn intersections (also known as “Michigan lefts” or “boulevard turnarounds”) are one such design that accommodates all left-turn movements via directional U-turn crossovers within the median. Prior work has consistently shown that median U-turn intersections can provide superior safety performance when used in the appropriate conditions. However， research that is specific to unsignalized reduced left-turn conflict intersections continues to be limited to work conducted before the Highway Safety Manual， or which includes restricted crossing U-turn intersections. This study included the evaluation of historical traffic crashes and volume data at 95 unsignalized intersections in the state of Michigan. This included the collection of data for 39 median U-turn sites and 56 reference group sites to estimate safety performance functions and crash modification factors that can be used when considering future conversions. Ultimately， crash modification factors for fatal and injury crashes of 0.438 and 0.686 are recommended when converting intersections with undivided two-lane two-way major approaches and four-lane divided boulevard major approaches， respectively. Although there was no significant difference in property damage only crashes associated with converting intersections with undivided， two-lane， two-way major approaches， a crash modification factor of 1.325 is recommended for property damage only crashes specific to conversions with four-lane， divided boulevard major approaches.',\n",
       "  'publicationTitle': 'Transportation Research Record: Journal of the Transportation Research Board',\n",
       "  'date': '2022-04-29',\n",
       "  'language': '',\n",
       "  'url': '',\n",
       "  'libraryCatalog': 'ResearchGate',\n",
       "  'tags': []},\n",
       " {'title': 'PVF-10: A high-resolution unmanned aerial vehicle thermal infrared image dataset for fine-grained photovoltaic fault classificationPVF-10： 用于精细光伏故障分类的高分辨率无人机热红外图像数据集',\n",
       "  'creators': 'WangBo',\n",
       "  'abstractNote': 'Accurate identification of faulty photovoltaic (PV) modules is crucial for the effective operation and maintenance of PV systems. Deep learning (DL) algorithms exhibit promising potential for classifying PV fault (PVF) from thermal infrared (TIR) images captured by unmanned aerial vehicle (UAV)， contingent upon the availability of extensive and high-quality labeled data. However， existing TIR PVF datasets are limited by low image resolution and incomplete coverage of fault types. This study proposes a high-resolution TIR PVF dataset with 10 classes， named PVF-10， comprising 5579 cropped images of PV panels collected from 8 PV power plants. These classes are further categorized into two groups according to the repairability of PVF， with 5 repairable and 5 irreparable classes each. Additionally， the circuit mechanisms underlying the TIR image features of typical PVF types are analyzed， supported by high-resolution images， thereby providing comprehensive information for PV operators. Finally， five state-of-the-art DL algorithms are trained and validated based on the PVF-10 dataset using three levels of resampling strategy. The results show that the overall accuracy (OA) of these algorithms exceeds 83%， with the highest OA reaching 93.32%. Moreover， the preprocessing procedure involving resampling and padding strategies are beneficial for improving PVF classification accuracy using PVF-10 datasets. The developed PVF-10 dataset is expected to stimulate further research and innovation in PVF classification.',\n",
       "  'publicationTitle': 'Applied Energy',\n",
       "  'date': '2024-12-15',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0306261924015708',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Classification',\n",
       "   'Deep learning',\n",
       "   'Photovoltaic fault',\n",
       "   'Thermal infrared data',\n",
       "   'Unmanned aerial vehicle']},\n",
       " {'title': 'Optimal time trajectory and coordination for connected and automated vehicles',\n",
       "  'creators': 'MalikopoulosAndreas A.',\n",
       "  'abstractNote': 'In this paper， we provide a decentralized theoretical framework for coordination of connected and automated vehicles (CAVs) at different traffic scenarios. The framework includes: (1) an upper-level optimization that yields for each CAV its optimal time trajectory and lane to pass through a given traffic scenario while alleviating congestion; and (2) a low-level optimization that yields for each CAV its optimal control input (acceleration/deceleration). We provide a complete， analytical solution of the low-level optimization problem that includes the rear-end， speed-dependent safety constraint. Furthermore， we provide a problem formulation for the upper-level optimization in which there is no duality gap. The latter implies that the optimal time trajectory for each CAV does not activate any of the state， control， and safety constraints of the low-level optimization， thus allowing for online implementation. Finally， we present a geometric duality framework with hyperplanes to derive the condition under which the optimal solution of the upper-level optimization always exists. We validate the effectiveness of the proposed theoretical framework through simulation.',\n",
       "  'publicationTitle': 'Automatica',\n",
       "  'date': '2021-03-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0005109820306671',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Autonomous intersections',\n",
       "   'Connected and automated vehicles',\n",
       "   'Cyber–physical systems',\n",
       "   'Decentralized optimal control',\n",
       "   'Emerging mobility',\n",
       "   'Path planning']},\n",
       " {'title': 'Optimal traffic operation for maximum energy efficiency in signal-free urban networks: A macroscopic analytical approach',\n",
       "  'creators': 'AmirgholyMahyar',\n",
       "  'abstractNote': 'The integration of artificial intelligence and wireless communication technologies in communicant autonomous vehicles (CAVs) enables coordinating the movement of CAV platoons at signal-free intersections. The capacity of signal-free intersections can be significantly improved by adjusting traffic variables at a macroscopic scale; however， the resulting improvement in the capacity does not necessarily have a positive impact on the energy consumption of CAVs at the network level. In this research， we develop an analytical model to enhance energy efficiency by optimizing macroscopic traffic variables in signal-free networks. To this end， we adopt a macroscopic modeling approach to estimate the operational capacity by accounting for the stochasticity resulting from the error in synchronizing the arrival and departure of consecutive platoons in crossing directions at intersections. We also develop a macrolevel analytical model to estimate expected energy loss during the acceleration/deceleration maneuver required for resynchronization at intersections as a function of synchronization success probability. We then maximize energy efficiency by minimizing expected energy loss and maximizing expected capacity in a biobjective optimization framework. We solve the energy efficiency problem using an analytical approach to derive a closed-form solution for the optimal traffic speed and the length of the marginal gap between the passage of consecutive platoons in crossing directions through intersections for a (general) normal distribution of the operational error. Having the closed-form solution of the energy efficiency problem， we balance the trade-off between energy loss and operational capacity at a large scale by extending the analytical model to the network level using the Macroscopic Fundamental Diagram (MFD) concept. The results of our two-ring simulation model indicate the accuracy of the proposed analytical model in estimating the macroscopic relationship between the expected energy loss at intersections and the vehicular density in signal-free networks. Our numerical results also show that optimizing the traffic speed and marginal gap length can improve energy efficiency by 31% at the cost of a 16% decrease in maximum capacity.',\n",
       "  'publicationTitle': 'Applied Energy',\n",
       "  'date': '2023-01-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S030626192201385X',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Autonomous vehicles',\n",
       "   'Biobjective optimization',\n",
       "   'Closed-form solution',\n",
       "   'Energy efficiency',\n",
       "   'Macroscopic fundamental diagram']},\n",
       " {'title': 'Optimal Coordination of Platoons of Connected and Automated Vehicles at Signal-Free Intersections',\n",
       "  'creators': 'KumaravelSharmila Devi',\n",
       "  'abstractNote': 'In this paper， we address the problem of coordinating platoons of connected and automated vehicles crossing a signal-free intersection. We present a decentralized， two-level optimal framework to coordinate the platoons with the objective to minimize travel delay and fuel consumption of every platoon crossing the intersection. At the upper-level， each platoon leader derives a proven optimal schedule to enter the intersection. At the low-level， the platoon leader derives their optimal control input (acceleration/deceleration) for the optimal schedule derived in the upper-level. We validate the effectiveness of the proposed framework in simulation and show significant improvements both in travel delay and fuel consumption compared to the baseline scenarios where platoons enter the intersection based on first-come-first-serve and longest queue first - maximum weight matching scheduling algorithms.',\n",
       "  'publicationTitle': 'IEEE Transactions on Intelligent Vehicles',\n",
       "  'date': '2022-06',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/9484798',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Delays',\n",
       "   'Fuels',\n",
       "   'Merging',\n",
       "   'Optimal control',\n",
       "   'Optimal scheduling',\n",
       "   'Platoons coordination',\n",
       "   'Schedules',\n",
       "   'Scheduling',\n",
       "   'connected and automated vehicles',\n",
       "   'intersection control']},\n",
       " {'title': 'Optimal Control for Connected and Autonomous Vehicles at Signal-Free Intersections',\n",
       "  'creators': 'ChenBoli',\n",
       "  'abstractNote': 'The development of connected and autonomous vehicles (CAVs) is one of the central aspects in the pathway towards future intelligent mobility systems. This paper addresses the problem of coordinating CAVs crossing an uncontrolled intersection so as to maintain safe and efficient traffic flow. The proposed control strategy is based on an optimal control framework that is formulated to minimize a weighted sum of total energy consumption and travel time of all CAVs by finding the optimal velocity trajectory of each vehicle. The design procedure starts with a proper formulation of the autonomous intersection crossing problem for CAVs， with various cases of energy recovery capability by the CAVs considered， to also investigate the influence of powertrain electrification on the intersection crossing problem. This yields an optimal control problem (OCP) with nonlinear and nonconvex dynamics and constraints. In order to ensure a rapid solution search and a unique global optimum， the OCP is reformulated via convex modeling techniques. Numerical results validate the effectiveness of the proposed approaches， while the trade-off between energy consumption and travel time is illustrated by Pareto optimal solutions.',\n",
       "  'publicationTitle': 'IFAC-PapersOnLine',\n",
       "  'date': '2020-01-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S2405896320330056',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Connected',\n",
       "   'Energy consumption',\n",
       "   'Intersections crossing',\n",
       "   'Optimization',\n",
       "   'Velocity control',\n",
       "   'autonomous vehicles']},\n",
       " {'title': 'Optimal Cooperative Driving at Signal-Free Intersections With Polynomial-Time Complexity',\n",
       "  'creators': 'PeiHuaxin',\n",
       "  'abstractNote': 'Cooperative driving at signal-free intersections， which aims to improve driving safety and efficiency for connected and automated vehicles， has attracted increasing interest in recent years. However， existing cooperative driving strategies either suffer from computational complexity or cannot guarantee global optimality. To fill this research gap， this paper proposes an optimal and computationally efficient cooperative driving strategy with the polynomial-time complexity. By modeling the conflict relations among the vehicles， the solution space of the cooperative driving problem is completely represented by a newly designed small-size state space. Then， based on dynamic programming， the globally optimal solution can be searched inside the state space efficiently. It is proved that the proposed strategy can reduce the time complexity of computation from exponential to a small-degree polynomial. Simulation results further demonstrate that the proposed strategy can obtain the globally optimal solution within a limited computation time under various traffic demand settings.',\n",
       "  'publicationTitle': 'IEEE Transactions on Intelligent Transportation Systems',\n",
       "  'date': '2022-08',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/9569746',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Connected and automated vehicles',\n",
       "   'Dynamic programming',\n",
       "   'Merging',\n",
       "   'Safety',\n",
       "   'Simulation',\n",
       "   'Space vehicles',\n",
       "   'Time complexity',\n",
       "   'Vehicle dynamics',\n",
       "   'cooperative driving',\n",
       "   'dynamic programming',\n",
       "   'signal-free intersection']},\n",
       " {'title': 'Modeling vehicle U-turning behavior near intersections: A deep learning approach based on TCN and multi-head attention',\n",
       "  'creators': 'ZengWeiliang',\n",
       "  'abstractNote': 'In U-turn bays near intersections， the conflict between U-turning vehicles and those going straight-ahead results in traffic accidents since straight-ahead vehicles cannot reliably anticipate the behavior of oncoming U-turning vehicles. However， previous studies on modeling U-turning behavior do not effectively capture the spatial–temporal interaction between the U-turning and surrounding vehicles. To address this issue， a deep-learning framework based on a temporal convolutional network (TCN) and multi-head attention mechanism is developed. The TCN is utilized to capture long-term dependencies of vehicles in the shared left- and U-turn lanes by extracting vehicle historical motion features. The self-attention mechanism extracts salient features related to the U-turn intentions， classifying the vehicles into left- and U-turning vehicles based on their driving intentions. A parallel TCN and spatial multi-head attention structure is constructed to model vehicle–vehicle interactions to further predict the future trajectory of U-turning vehicles. Finally， the obtained features are input into a Transformer-based decoder module and trajectory generator to predict the future displacement and body orientation of U-turning vehicles. The model is validated via comparison with state-of-the-art models and the observed trajectories under various scenarios. Ablation studies are conducted to quantify the efficacy of each module. Further， the effect of the surrounding homogenous and heterogeneous vehicles on U-turning vehicles in four different U-turn scenarios is quantified using spatial–temporal variation graphs and attention matrices.',\n",
       "  'publicationTitle': 'Expert Systems with Applications',\n",
       "  'date': '2024-09-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0957417424005402',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Deep learning',\n",
       "   'Driving intention',\n",
       "   'Multi-head attention mechanism',\n",
       "   'Trajectory prediction',\n",
       "   'Vehicle U-turning behavior']},\n",
       " {'title': 'Methods of photovoltaic fault detection and classification: A review',\n",
       "  'creators': 'HongYing-Yi',\n",
       "  'abstractNote': 'Photovoltaic (PV) fault detection and classification are essential in maintaining the reliability of the PV system (PVS). Various faults may occur in either DC or AC side of the PVS. The detection， classification， and localization of such faults are essential for mitigation， accident prevention， reduction of the loss of generated energy， and revenue. In recent years， the number of works of PV fault detection and classification has significantly increased. These works have been reviewed by considering the categorization of detection and classification techniques. This paper improves of the categorization of methods to study the faulty PVS by considering visual and thermal method and electrical based method. Moreover， an effort is made to list all potential faults in a PVS in both the DC and AC sides. Specific PV fault detection and classification techniques are also enumerated. A possible direction for research on the PV fault detection and classification， such as quantum machine learning， internet of things， and cloud/edge computing technologies， is suggested as a guide for future emerging technologies.',\n",
       "  'publicationTitle': 'Energy Reports',\n",
       "  'date': '2022-11-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S2352484722008022',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Classification',\n",
       "   'Detection',\n",
       "   'Electrical based method',\n",
       "   'Fault',\n",
       "   'Photovoltaic system',\n",
       "   'Visual and thermal method']},\n",
       " {'title': 'Evaluating the Impacts of Different Exit Strategies of Emergency Vehicle Preemption on Arterial Signal Coordination: A Case Study in Reno, Nevada',\n",
       "  'creators': 'XuJianyuan',\n",
       "  'abstractNote': 'AbstractEmergency vehicle preemption (EVP)， a common traffic signal preemption in urban areas，\\nis used to prioritize the right-of-way to emergency vehicles at signalized intersections\\nby terminating active signal timing plans and running preemption plans. ...Practical ApplicationsTraffic signal preemption systems play an essential role in emergency response management\\nin terms of shortened response times， improved traffic safety， and potential cost\\nsavings. As one of the common traffic signal preemptions in ...',\n",
       "  'publicationTitle': 'Journal of Transportation Engineering, Part A: Systems',\n",
       "  'date': '2023/11/01',\n",
       "  'language': 'EN',\n",
       "  'url': 'https://ascelibrary.org/doi/10.1061/JTEPBS.TEENG-7819',\n",
       "  'libraryCatalog': 'ASCE',\n",
       "  'tags': []},\n",
       " {'title': 'Merging process of U-turns at uncontrolled median openings under mixed traffic conditions',\n",
       "  'creators': 'SilGourab',\n",
       "  'abstractNote': 'At an uncontrolled median opening， the limited priority situation and the high degree of heterogeneity in traffic stream make the merging manoeuvre of U-turning vehicles very much complex. This study is an attempt to understand this merging manoeuvre. The different types of merging manoeuvres have been identified in the field and accordingly classified into different categories. Depending upon the number of vehicles that can merge all together into the opposing through traffic by accepting a single gap， the merging has been classified into two types: single entry merging and multiple entry merging. On the other hand， based on the situation of priority of movement， the merging process is divided into another two categories: ideal merging and forced merging. More explicitly， the ideal merging is split into free merging and Swift Merging (SM). In addition， the forced entry merging is categorized into Gradual Merging (GM) and Aggressive Merging (AM). Time distance diagrams for different types of merging are presented for their better understanding. Field data collected at seven median openings located on various 6-lane divided urban roads are used to analyse different types of merging in a mixed traffic situation. All vehicles plying on the road are divided into 5 categories such as car， motorized two-wheeler (2-W)， motorized three-wheeler (3-W)， Sports Utility Vehicle (SUV)， and Light Commercial Vehicle (LCV) and the merging behaviour of these categories of vehicles have been studied. The effect of influencing parameters like opposing traffic volume and delay on merging are investigated. Mathematical relations are developed between Merging Time (MT) of a vehicle type and the opposing traffic volume. To address the effect of Service Delay (SD) on the MT of a vehicle， models are proposed between SD and MT for all the five categories of vehicles. The two types of merging; gradual and swift are prominently observed in field. The time required by different categories of vehicles for these two merging at various traffic volume levels are determined. Finally， two-tailed t-test is conducted to see if the MT for the two different types of merging is statistically different.\\nFirst published online 26 October 2016\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tKeyword : \\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tmerging， \\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tmedian opening， \\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tmixed traffic， \\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tservice delay， \\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\topposing traffic\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\tHow to Cite\\n\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n  Sil， G.， Mohapatra， S. S.， Dey， P. P.， & Chandra， S. (2018). Merging process of U-turns at uncontrolled median openings under mixed traffic conditions. Transport， 33(2)， 370–379. 
https://doi.org/10.3846/16484142.2016.1247295\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tMore Citation Formats\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tACM\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tACS\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tAPA\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tABNT\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tChicago\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tHarvard\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tIEEE\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tMLA\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tTurabian\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tVancouver',\n",
       "  'publicationTitle': 'Transport',\n",
       "  'date': '2018-01-26',\n",
       "  'language': 'en',\n",
       "  'url': 'https://journals.vilniustech.lt/index.php/Transport/article/view/193',\n",
       "  'libraryCatalog': 'journals.vilniustech.lt',\n",
       "  'tags': ['median opening',\n",
       "   'merging',\n",
       "   'mixed traffic',\n",
       "   'opposing traffic',\n",
       "   'service delay']},\n",
       " {'title': 'Intelligent vehicle control at signal-free intersection under mixed connected environment',\n",
       "  'creators': 'YangHao',\n",
       "  'abstractNote': '<em>IET Intelligent Transport Systems</em> is an interdisciplinary journal publishing research on the practical applications of intelligent transport systems and infrastructure.',\n",
       "  'publicationTitle': 'IET Intelligent Transport Systems',\n",
       "  'date': '2020/02/01',\n",
       "  'language': 'en',\n",
       "  'url': 'https://ietresearch.onlinelibrary.wiley.com/doi/10.1049/iet-its.2019.0175',\n",
       "  'libraryCatalog': 'ietresearch.onlinelibrary.wiley.com',\n",
       "  'tags': []},\n",
       " {'title': 'Emergency vehicle route oriented signal coordinated control model with two-level programming',\n",
       "  'creators': 'YaoJiao',\n",
       "  'abstractNote': 'To minimize travel time of emergency vehicles on the way and improve efficiency of emergency response， an emergency vehicle route oriented signal coordinated control model with two-level programming was proposed based on the different priority types and priority levels of emergency vehicles. The upper level is the dynamic offset model of emergency vehicles， and the lower level is the green wave model of emergency vehicles. At dynamic offset level， latter phase was calculated based on the queue length ahead of the emergency vehicles and their arrival time， in which the former phase was the reference object. At route green wave level， maximum bandwidth of the route of emergency vehicles was calculated， based on the turning movement characteristics and its corresponding capacity reduction. Furthermore， the two-level programming model solution is obtained with genetic algorithm. Finally， simulation results of three control strategies， which are no-signal priority control strategy， isolated control priority strategy and coordinated priority control strategy in this paper， were obtained in micro-traffic simulation software VISSIM， with the case including three intersections in Suzhou roads as the emergency vehicles route. From the simulation results we can conclude that compared to no-signal priority control strategy， coordinated priority strategy can reduce delay， travel time， queue length and stops of emergency vehicles by 27，18， 36 and 38%， respectively， and the average delay of total vehicles at intersection can be reduced by 20%; compared to isolated control priority strategy， these numbers are 14， 6， 12， 21 and 22%， respectively， which means great improvement， and influence on social background traffic was also considered in it.',\n",
       "  'publicationTitle': 'Soft Computing',\n",
       "  'date': '2018-07-01',\n",
       "  'language': 'en',\n",
       "  'url': 'https://doi.org/10.1007/s00500-017-2826-x',\n",
       "  'libraryCatalog': 'Springer Link',\n",
       "  'tags': ['Artificial Intelligence',\n",
       "   'Coordinated control',\n",
       "   'Dynamic offset of phase',\n",
       "   'Emergency traffic',\n",
       "   'Genetic algorithm',\n",
       "   'Green wave',\n",
       "   'Route',\n",
       "   'Two-level programming']},\n",
       " {'title': 'Distributed conflict-free cooperation for multiple connected vehicles at unsignalized intersections',\n",
       "  'creators': 'XuBiao',\n",
       "  'abstractNote': 'Connected vehicles will change the modes of future transportation management and organization， especially at intersections. In this paper， we propose a distributed conflict-free cooperation method for multiple connected vehicles at unsignalized intersections. We firstly project the approaching vehicles from different traffic movements into a virtual lane and introduce a conflict-free geometry topology considering the conflict relationship of involved vehicles， thus constructing a virtual platoon. Then we present the modeling of communication topology to describe two modes of information transmission between vehicles. Finally， a distributed controller is designed to stabilize the virtual platoon for conflict-free cooperation at intersections. Numerical simulations validate the effectiveness of this method.',\n",
       "  'publicationTitle': 'Transportation Research Part C: Emerging Technologies',\n",
       "  'date': '2018-08-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X18308246',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Connected and automated vehicle',\n",
       "   'Cooperative control',\n",
       "   'Unsignalized intersection',\n",
       "   'Virtual platoon']},\n",
       " {'title': 'Decision making of autonomous vehicles in lane change scenarios: Deep reinforcement learning approaches with risk awareness',\n",
       "  'creators': 'Unknown Author',\n",
       "  'abstractNote': 'Driving safety is the most important element that needs to be considered for autonomous vehicles (AVs). To ensure driving safety， we proposed a lane c…',\n",
       "  'publicationTitle': 'Transportation Research Part C: Emerging Technologies',\n",
       "  'date': '2022/01/01',\n",
       "  'language': 'en-US',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X21004411',\n",
       "  'libraryCatalog': 'www.sciencedirect.com',\n",
       "  'tags': []},\n",
       " {'title': 'Development of a signal-head-free intersection control logic in a fully connected and autonomous vehicle environment',\n",
       "  'creators': 'MirheliAmir',\n",
       "  'abstractNote': 'Establishment of effective cooperation between vehicles and transportation infrastructure improves travel reliability in urban transportation networks. Lack of collaboration， however， exacerbates congestion due mainly to frequent stops at signalized intersections. It is beneficial to develop a control logic that collects basic safety message from approaching connected and autonomous vehicles and guarantees efficient intersection operations with safe and incident free vehicle maneuvers. In this paper， a signal-head-free intersection control logic is formulated into a dynamic programming model that aims to maximize the intersection throughput. A stochastic look-ahead technique is proposed based on Monte Carlo tree search algorithm to determine the near-optimal actions (i.e.， acceleration rates) over time to prevent movement conflicts. Our numerical results confirm that the proposed technique can solve the problem efficiently and addresses the consequences of existing traffic signals. The proposed approach， while completely avoids incidents at intersections， significantly reduces travel time (ranging between 59.4% and 83.7% when compared to fixed-time and fully-actuated control strategies) at intersections under various demand patterns.',\n",
       "  'publicationTitle': 'Transportation Research Part C: Emerging Technologies',\n",
       "  'date': '2018-07-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X1830576X',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Autonomous intersection control',\n",
       "   'Connected and autonomous vehicles',\n",
       "   'Control logic',\n",
       "   'Dynamic programming',\n",
       "   'Look-ahead model',\n",
       "   'Monte Carlo tree search']},\n",
       " {'title': 'Decentralized optimal control of Connected Automated Vehicles at signal-free intersections including comfort-constrained turns and safety guarantees',\n",
       "  'creators': 'ZhangYue',\n",
       "  'abstractNote': 'We extend earlier work for optimally controlling Connected Automated Vehicles (CAVs) crossing a signal-free intersection by including all possible turns taken so as to optimize a passenger comfort metric along with energy and travel time minimization. We show that it is possible to achieve this goal in a decentralized manner with each CAV solving an optimal control problem， and derive explicit solutions that guarantee collision avoidance and safe distance constraints within a control zone. We investigate the associated tradeoffs between minimizing energy and vehicle travel time， as well as the passenger comfort metric and include extensive simulations to illustrate this framework.',\n",
       "  'publicationTitle': 'Automatica',\n",
       "  'date': '2019-11-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0005109819304248',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': []},\n",
       " {'title': 'Controllability Analysis and Optimal Control of Mixed Traffic Flow With Human-Driven and Autonomous Vehicles',\n",
       "  'creators': 'WangJiawei',\n",
       "  'abstractNote': 'Connected and automated vehicles (CAVs) have a great potential to improve traffic efficiency in mixed traffic systems， which has been demonstrated by multiple numerical simulations and field experiments. However， some fundamental properties of mixed traffic flow， including controllability and stabilizability， have not been well understood. This paper analyzes the controllability of mixed traffic systems and designs a system-level optimal control strategy. Using the Popov-Belevitch-Hautus (PBH) criterion， we prove for the first time that a ring-road mixed traffic system with one CAV and multiple heterogeneous human-driven vehicles is not completely controllable， but is stabilizable under a very mild condition. Then， we formulate the design of a system-level control strategy for the CAV as a structured optimal control problem， where the CAV’s communication ability is explicitly considered. Finally， we derive an upper bound for reachable traffic velocity via controlling the CAV. Extensive numerical experiments verify the effectiveness of our analytical results and the proposed control strategy. Our results validate the possibility of utilizing CAVs as mobile actuators to smooth traffic flow actively.',\n",
       "  'publicationTitle': 'IEEE Transactions on Intelligent Transportation Systems',\n",
       "  'date': '2021-12',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/9127876',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Autonomous vehicle',\n",
       "   'Autonomous vehicles',\n",
       "   'Controllability',\n",
       "   'Optimal control',\n",
       "   'Road traffic',\n",
       "   'Stability analysis',\n",
       "   'Vehicle dynamics',\n",
       "   'controllability and stabilizability',\n",
       "   'mixed traffic flow',\n",
       "   'structured optimal control']},\n",
       " {'title': 'COOR-PLT: A hierarchical control model for coordinating adaptive platoons of connected and autonomous vehicles at signal-free intersections based on deep reinforcement learning',\n",
       "  'creators': 'LiDuowei',\n",
       "  'abstractNote': 'Platooning and coordination are two implementation strategies that are frequently proposed for traffic control of connected and autonomous vehicles (CAVs) at signal-free intersections instead of using conventional traffic signals. However， few studies have attempted to integrate both strategies to better facilitate the CAV control at signal-free intersections. To this end， this study proposes a hierarchical control model， named COOR-PLT， to coordinate adaptive CAV platoons at a signal-free intersection based on deep reinforcement learning (DRL). COOR-PLT has a two-layer framework. The first layer uses a centralized control strategy to form adaptive platoons. The optimal size of each platoon is determined by considering multiple objectives (i.e.， efficiency， fairness and energy saving). The second layer employs a decentralized control strategy to coordinate multiple platoons passing through the intersection. Each platoon is labeled with coordinated status or independent status， upon which its passing priority is determined. As an efficient DRL algorithm， Deep Q-network (DQN) is adopted to determine platoon sizes and passing priorities respectively in the two layers. The model is validated and examined on the simulator Simulation of Urban Mobility (SUMO). The simulation results demonstrate that the model is able to: (1) achieve satisfactory convergence performances; (2) adaptively determine platoon size in response to varying traffic conditions; and (3) completely avoid deadlocks at the intersection. By comparison with other control methods， the model manifests its superiority of adopting adaptive platooning and DRL-based coordination strategies. Also， the model outperforms several state-of-the-art methods on reducing travel time and fuel consumption in different traffic conditions.',\n",
       "  'publicationTitle': 'Transportation Research Part C: Emerging Technologies',\n",
       "  'date': '2023-01-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X22003461',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Adaptive platoon',\n",
       "   'Connected and autonomous vehicle (CAV)',\n",
       "   'Deep reinforcement learning',\n",
       "   'Hierarchical control',\n",
       "   'Multi-agent coordination',\n",
       "   'Signal-free intersection']},\n",
       " {'title': 'Comparison of Cooperative Driving Strategies for CAVs at Signal-Free Intersections',\n",
       "  'creators': 'XuHuile',\n",
       "  'abstractNote': 'The properties of cooperative driving strategies for planning and controlling Connected and Automated Vehicles (CAVs) at intersections range from some that achieve highly efficient coordination performance to others whose implementation is computationally fast. This paper comprehensively compares the performance of four representative strategies in terms of travel time， energy consumption， computation time， and fairness under different conditions， including the geometric configuration of intersections， asymmetry in traffic arrival rates， and the relative magnitude of these rates. Our simulation-based study has led to the following conclusions: 1) The Monte Carlo Tree Search (MCTS)-based strategy achieves the best traffic efficiency and has great performance in fuel consumption; 2) MCTS and Dynamic Resequencing (DR) strategies both perform well in all metrics of interest. If the computation budget is adequate， the MCTS strategy is recommended; otherwise， the DR strategy is preferable; 3) An asymmetric intersection has a noticeable impact on the strategies， whereas the influence of the arrival rates can be neglected. When the geometric shape is asymmetrical， the modified First-In-First-Out (FIFO) strategy significantly outperforms the FIFO strategy and works well when the traffic demand is moderate， but their performances are similar in other situations; and 4) Improving traffic efficiency sometimes comes at the cost of fairness， but the DR and MCTS strategies can be adjusted to realize a better trade-off between various performance metrics by appropriately designing their objective functions.',\n",
       "  'publicationTitle': 'IEEE Transactions on Intelligent Transportation Systems',\n",
       "  'date': '2022-07',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/9406435',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Automation',\n",
       "   'Connected and automated vehicles (CAVs)',\n",
       "   'Measurement',\n",
       "   'Optimal control',\n",
       "   'Real-time systems',\n",
       "   'Safety',\n",
       "   'Systems engineering and theory',\n",
       "   'Vehicle dynamics',\n",
       "   'cooperative driving strategy',\n",
       "   'crossing sequence']},\n",
       " {'title': 'Adaptive Multi-Scale Fusion Blind Deblurred Generative Adversarial Network Method for Sharpening Image Data',\n",
       "  'creators': 'ZhuBaoyu',\n",
       "  'abstractNote': 'Drone and aerial remote sensing images are widely used， but their imaging environment is complex and prone to image blurring. Existing CNN deblurring algorithms usually use multi-scale fusion to extract features in order to make full use of aerial remote sensing blurred image information， but images with different degrees of blurring use the same weights， leading to increasing errors in the feature fusion process layer by layer. Based on the physical properties of image blurring， this paper proposes an adaptive multi-scale fusion blind deblurred generative adversarial network (AMD-GAN)， which innovatively applies the degree of image blurring to guide the adjustment of the weights of multi-scale fusion， effectively suppressing the errors in the multi-scale fusion process and enhancing the interpretability of the feature layer. The research work in this paper reveals the necessity and effectiveness of a priori information on image blurring levels in image deblurring tasks. By studying and exploring the image blurring levels， the network model focuses more on the basic physical features of image blurring. Meanwhile， this paper proposes an image blurring degree description model， which can effectively represent the blurring degree of aerial remote sensing images. The comparison experiments show that the algorithm in this paper can effectively recover images with different degrees of blur， obtain high-quality images with clear texture details， outperform the comparison algorithm in both qualitative and quantitative evaluation， and can effectively improve the object detection performance of blurred aerial remote sensing images. Moreover， the average PSNR of this paper’s algorithm tested on the publicly available dataset RealBlur-R reached 41.02 dB， surpassing the latest SOTA algorithm.',\n",
       "  'publicationTitle': 'Drones',\n",
       "  'date': '2023/2',\n",
       "  'language': 'en',\n",
       "  'url': 'https://www.mdpi.com/2504-446X/7/2/96',\n",
       "  'libraryCatalog': 'www.mdpi.com',\n",
       "  'tags': ['deep learning',\n",
       "   'drone and aerial remote sensing',\n",
       "   'generative adversarial networks',\n",
       "   'image blur level',\n",
       "   'image deblurring',\n",
       "   'multi-scale',\n",
       "   'object detection']},\n",
       " {'title': 'A Survey on Emergency Vehicle Preemption Methods Based on Routing and Scheduling',\n",
       "  'creators': 'KambleShridevi Jeevan',\n",
       "  'abstractNote': 'Emergency Vehicles (EVs) play a significant role in saving human lives and property damages. Reducing the time delay of emergency vehicles is important to enhance emergency service performance. The preemption methods are powerful strategies that assist emergency vehicles to reach the desired destination quickly by managing the competing normal traffic along the emergency vehicle approaching lane. The EV preemption models pre-clears the vehicles on the EV approaching lane by interrupting the signal timings and boosting EV arrival speed even the road traffic is high. With the assistance of preemption models， the EVs are not stopping or waiting at signalized intersections. Also， the preemption models diminish the vehicle conflict problems on the EV approaching lane. Moreover， the preemption models use different strategies to navigate the EVs on their routes efficiently. Hence， a detailed survey is needed to understand the different preemption strategies and analyze the gaps which are not effectively solved by existing literature. This paper attempts to survey the existing EV preemption methods with detailed discussions. For a clear view， the survey divides the existing preemption models into three types that are routing-based， scheduling-based， and miscellaneous. The survey compares the preemption methods with their advantages and limitations. Further， it analyzes the gaps which are not solved in existing solutions and describe the possible future directions that pave the way for innovating novel realistic preemption solutions.',\n",
       "  'publicationTitle': 'International Journal of Computer Networks and Applications',\n",
       "  'date': '2022-02-28',\n",
       "  'language': 'en',\n",
       "  'url': 'http://www.i-scholar.in/index.php/IJCNA/article/view/211623',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'A Survey of Monte Carlo Tree Search Methods',\n",
       "  'creators': 'BrowneCameron B.',\n",
       "  'abstractNote': \"Monte Carlo tree search (MCTS) is a recently proposed search method that combines the precision of tree search with the generality of random sampling. It has received considerable interest due to its spectacular success in the difficult problem of computer Go， but has also proved beneficial in a range of other domains. This paper is a survey of the literature to date， intended to provide a snapshot of the state of the art after the first five years of MCTS research. We outline the core algorithm's derivation， impart some structure on the many variations and enhancements that have been proposed， and summarize the results from the key game and nongame domains to which MCTS methods have been applied. A number of open research questions indicate that the field is ripe for future work.\",\n",
       "  'publicationTitle': 'IEEE Transactions on Computational Intelligence and AI in Games',\n",
       "  'date': '2012-03',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/6145622',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Artificial intelligence',\n",
       "   'Artificial intelligence (AI)',\n",
       "   'Computers',\n",
       "   'Decision theory',\n",
       "   'Game theory',\n",
       "   'Games',\n",
       "   'Markov processes',\n",
       "   'Monte Carlo methods',\n",
       "   'Monte Carlo tree search (MCTS)',\n",
       "   'bandit-based methods',\n",
       "   'computer Go',\n",
       "   'game search',\n",
       "   'upper confidence bounds (UCB)',\n",
       "   'upper confidence bounds for trees (UCT)']},\n",
       " {'title': 'A lightweight network for photovoltaic cell defect detection in electroluminescence images based on neural architecture search and knowledge distillation',\n",
       "  'creators': 'ZhangJinxia',\n",
       "  'abstractNote': 'Nowadays， the rapid development of photovoltaic(PV) power stations requires increasingly reliable maintenance and fault diagnosis of PV modules in the field. Due to the effectiveness， convolutional neural network (CNN) has been widely used in the existing automatic defect detection of PV cells. However， the parameters of these CNN-based models are very large， which require stringent hardware resources and it is difficult to be applied in actual industrial projects. To solve these problems， we propose a novel lightweight high-performance model for automatic defect detection of PV cells in electroluminescence(EL) images based on neural architecture search and knowledge distillation. To auto-design an effective lightweight model， we introduce neural architecture search to the field of PV cell defect classification for the first time. Since the defect can be any size， we design a proper search structure of network to better exploit the multi-scale characteristic. To improve the overall performance of the searched lightweight model， we further transfer the knowledge learned by the existing pre-trained large-scale model based on knowledge distillation. Different kinds of knowledge are exploited and transferred， including attention information， feature information， logit information and task-oriented information. Experiments have demonstrated that the proposed model achieves the state-of-the-art performance on the public PV cell dataset of EL images under online data augmentation with accuracy of 91.74% and the parameters of 1.85M. The proposed lightweight high-performance model can be easily deployed to the end devices of the actual industrial projects and retain the accuracy.',\n",
       "  'publicationTitle': 'Applied Energy',\n",
       "  'date': '2024-02-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0306261923015489',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Deep learning',\n",
       "   'Defect detection',\n",
       "   'Electroluminescence',\n",
       "   'Knowledge distillation',\n",
       "   'Neural architecture search',\n",
       "   'Photovoltaic cells']},\n",
       " {'title': 'A novel intelligent traffic recovery model for emergency vehicles based on context-aware reinforcement learning',\n",
       "  'creators': 'KianiFarzad',\n",
       "  'abstractNote': 'Management of traffic emergencies has become very popular in recent years. However， timely response to emergencies and recovering from an emergency is an important problem in itself. The strategies in the current studies merely suggest that after an emergency vehicle passes， the state should iterate to the next phase. Therefore， this paper proposes a novel approach for recovering from an emergency situation at an intersection based on real scenarios. The proposed method is a combination of context-aware and Reinforcement Learning (RL) models that predicts better alternatives for different states rather than just iterating to the next phase. In this regard， a new algorithm， named Interrupt Algorithm， is proposed to predict proper actions for recovering the emergency situation. This algorithm uses a Q-learning-based model that learns from traffic context for an emergency situation and chooses viable action from an action set. The recovery actions are categorized as max， min， and avg， respectively. Test results show that our proposed model outperforms traffic flow over than standard single choice recovering action-based approach by approximately 80%. Based on this， it may be more beneficial to choose different actions and therefore， proposed algorithm with the help of RL presents a more dynamic emergency recovery model.',\n",
       "  'publicationTitle': 'Information Sciences',\n",
       "  'date': '2023-01-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0020025522013469',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Emergency situation',\n",
       "   'Intelligent traffic management',\n",
       "   'Q-learning',\n",
       "   'Reinforcement learning',\n",
       "   'Traffic recovery']},\n",
       " {'title': 'A multiagent approach to autonomous intersection management',\n",
       "  'creators': 'DresnerKurt',\n",
       "  'abstractNote': \"Artificial intelligence research is ushering in a new era of sophisticated， mass-market transportation technology. While computers can already fly a passenger jet better than a trained human pilot， people are still faced with the dangerous yet tedious task of driving automobiles. Intelligent Transportation Systems (ITS) is the field that focuses on integrating information technology with vehicles and transportation infrastructure to make transportation safer， cheaper， and more efficient. Recent advances in ITS point to a future in which vehicles themselves handle the vast majority of the driving task. Once autonomous vehicles become popular， autonomous interactions amongst multiple vehicles will be possible. Current methods of vehicle coordination， which are all designed to work with human drivers， will be outdated. The bottleneck for roadway efficiency will no longer be the drivers， but rather the mechanism by which those drivers' actions are coordinated. While open-road driving is a well-studied and more-or-less-solved problem， urban traffic scenarios， especially intersections， are much more challenging.We believe current methods for controlling traffic， specifically at intersections， will not be able to take advantage of the increased sensitivity and precision of autonomous vehicles as compared to human drivers. In this article， we suggest an alternative mechanism for coordinating the movement of autonomous vehicles through intersections. Drivers and intersections in this mechanism are treated as autonomous agents in a multiagent system. In this multiagent system， intersections use a new reservation-based approach built around a detailed communication protocol， which we also present. We demonstrate in simulation that our new mechanism has the potential to significantly outperform current intersection control technology--traffic lights and stop signs. Because our mechanism can emulate a traffic light or stop sign， it subsumes the most popular current methods of intersection control. This article also presents two extensions to the mechanism. The first extension allows the system to control human-driven vehicles in addition to autonomous vehicles. The second gives priority to emergency vehicles without significant cost to civilian vehicles. The mechanism， including both extensions， is implemented and tested in simulation， and we present experimental results that strongly attest to the efficacy of this approach.\",\n",
       "  'publicationTitle': 'J. Artif. Int. Res.',\n",
       "  'date': '三月 1, 2008',\n",
       "  'language': '',\n",
       "  'url': '',\n",
       "  'libraryCatalog': 'ACM Digital Library',\n",
       "  'tags': []},\n",
       " {'title': 'A Feasibility Analysis at Signal-Free Intersections',\n",
       "  'creators': 'TzortzoglouFilippos N.',\n",
       "  'abstractNote': 'In this letter， we address the problem of improving the feasible domain of the solution of a decentralized control framework for coordinating connected and automated vehicles (CAVs) at signal-free intersections. The framework provides the optimal trajectories of CAVs to cross the intersection safely without stop-and-go driving. However， when traffic volume exceeds a certain level， finding a feasible solution for a CAV may become unattainable. We use concepts of numerical interpolation to identify appropriate polynomials that can serve as alternative trajectories of the CAVs， expanding the domain of the feasible CAV trajectories. We select the alternative polynomials through an optimization problem that aims at minimizing jerk. Finally， we demonstrate the efficacy of our approach through numerical simulations.',\n",
       "  'publicationTitle': 'IEEE Control Systems Letters',\n",
       "  'date': '2024',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/10551377',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Boundary conditions',\n",
       "   'Connected automated vehicles',\n",
       "   'Cruise control',\n",
       "   'Interpolation',\n",
       "   'Optimization',\n",
       "   'Polynomials',\n",
       "   'Safety',\n",
       "   'Traffic flow',\n",
       "   'Trajectory']},\n",
       " {'title': 'A Conflict Duration Graph-Based Coordination Method for Connected and Automated Vehicles at Signal-Free Intersections',\n",
       "  'creators': 'DengZhiyun',\n",
       "  'abstractNote': 'Previous studies on Connected and Automated Vehicles (CAVs) demonstrated the potential to coordinate the behaviors of multiple connected vehicles for traffic improvements. In this paper， we first propose a Conflict Duration Graph-based (CDG-based) coordination framework to resolve collisions and improve the traffic capacity of signal-free intersections. Secondly， a Speed Control-based Intersection Coordination Model (SICM) is developed to identify complex constraints in multi-vehicle collision scenarios. Thirdly， a geometric Translation-based Intersection Coordination Algorithm (TICA) is proposed to calculate the ideal location of time blocks in CDGs and then obtain the near-optimal design speed in the form of combinatorial optimization. Twelve groups of test scenarios with different traffic volumes were designed and tested on a MATLAB-based simulation platform. Simulation results showed that the proposed method can resolve all the collisions and instruct the vehicles to pass signal-free intersections collaboratively without stopping in low to medium level of congestion.',\n",
       "  'publicationTitle': 'Applied Sciences',\n",
       "  'date': '2020/1',\n",
       "  'language': 'en',\n",
       "  'url': 'https://www.mdpi.com/2076-3417/10/18/6223',\n",
       "  'libraryCatalog': 'www.mdpi.com',\n",
       "  'tags': ['connected and automated vehicles',\n",
       "   'multi-vehicle collision resolution',\n",
       "   'signal-free intersection',\n",
       "   'traffic coordination method']},\n",
       " {'title': 'A deep learning framework for modelling left-turning vehicle behaviour considering diagonal-crossing motorcycle conflicts at mixed-flow intersections',\n",
       "  'creators': 'YaoRuoyu',\n",
       "  'abstractNote': 'With heterogeneous traffic agents moving at unprotected phase， severe crossing conflicts are witnessed at mixed-flow intersections， especially when left-turning vehicles are confronted with motorcycles. However， for modelling vehicle turning behaviour， potential conflicts involving diagonal-crossing motorcycles are seldom investigated in existing studies. To explore these scenes， we present a novel interaction-aware deep-learning framework. Firstly， a Long Short-Term Memory (LSTM) based network is employed to encode vehicle historical motion features. Secondly， each vehicle’s potential target lanes are identified with a probabilistic method， followed by a pooling module that extracts and summarizes intention features. Thirdly， Graph Attention Network (GAT) and a synthesized network are introduced to model vehicle-vehicle interaction and vehicle-motorcycle interaction respectively. Finally， multiple kinds of obtained features are sent to a LSTM based decoder module， where both future displacement and body orientation of vehicles are predicted. In short-time simulation experiments， average displacement error is reduced by 47.7% and 20.0% compared to baseline and state-of-the-art methods， with ablation studies conducted to quantify the efficacy of each kind of feature. Moreover， regarding recursive simulation， our model shows availability of reproducing lane-selecting and motorcycle-evasive behaviours. Distributions of post-encroachment time further indicate that the proposed framework can serve as a promising method to realize reliable motion planning for autonomous vehicles.',\n",
       "  'publicationTitle': 'Transportation Research Part C: Emerging Technologies',\n",
       "  'date': '2021-11-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X21004095',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Deep learning',\n",
       "   'Mixed-flow intersection',\n",
       "   'Trajectory prediction',\n",
       "   'Vehicle behaviour modelling']},\n",
       " {'title': 'A decentralized energy-optimal control framework for connected automated vehicles at signal-free intersections',\n",
       "  'creators': 'MalikopoulosAndreas A.',\n",
       "  'abstractNote': 'We address the problem of optimally controlling connected and automated vehicles (CAVs) crossing an urban intersection without any explicit traffic signaling， so as to minimize energy consumption subject to a throughput maximization requirement. We show that the solution of the throughput maximization problem depends only on the hard safety constraints imposed on CAVs and its structure enables a decentralized optimal control problem formulation for energy minimization. We present a complete analytical solution of these decentralized problems and derive conditions under which feasible solutions satisfying all safety constraints always exist. The effectiveness of the proposed solution is illustrated through simulation which shows substantial dual benefits of the proposed decentralized framework by allowing CAVs to conserve momentum and fuel while also improving travel time.',\n",
       "  'publicationTitle': 'Automatica',\n",
       "  'date': '2018-07-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0005109818301511',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Autonomous intersections',\n",
       "   'Connected and automated vehicles',\n",
       "   'Decentralized optimal control',\n",
       "   'Energy usage',\n",
       "   'Motion planning',\n",
       "   'Safety',\n",
       "   'Traffic flow']},\n",
       " {'title': 'A consensus-based distributed trajectory control in a signal-free intersection',\n",
       "  'creators': 'MirheliAmir',\n",
       "  'abstractNote': 'This paper develops a distributed cooperative control logic to determine conflict-free trajectories for connected and automated vehicles (CAVs) in signal-free intersections. The cooperative trajectory planning problem is formulated as vehicle-level mixed-integer non-linear programs (MINLPs) that aim to minimize travel time of each vehicle and their speed variations， while avoiding near-crash conditions. To push vehicle-level solutions towards global optimality， we develop a coordination scheme between CAVs on conflicting movements. The coordination scheme shares vehicle states (i.e.， location) over a prediction horizon and incorporates such information in CAVs’ respective MINLPs. Therefore， the CAVs will reach consensus through an iterative process and select conflict-free trajectories that minimize their travel time. The numerical experiments quantify the effects of the proposed methodology on traffic safety and performance measures in an intersection. The results show that the proposed distributed coordinated framework converges to near-optimal CAV trajectories with no conflicts in the intersection neighborhood. While the solutions are found in real-time， the comparison to a central intersection control logic for CAVs indicates a maximum marginal objective value of 2.30%. Furthermore， the maximum marginal travel time， throughput， and average speed do not exceed 0.5%， 0.1%， and 0.5%， respectively. The proposed control logic reduced travel time by 43.0–70.5%， and increased throughput and average speed respectively by 0.8–115.6% and 59.1–400.0% compared to an optimized actuated signal control， while eliminating all near-crash conditions.',\n",
       "  'publicationTitle': 'Transportation Research Part C: Emerging Technologies',\n",
       "  'date': '2019-03-01',\n",
       "  'language': '',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X18311343',\n",
       "  'libraryCatalog': 'ScienceDirect',\n",
       "  'tags': ['Connected and autonomous vehicles',\n",
       "   'Control logic',\n",
       "   'Cooperative',\n",
       "   'Coordination',\n",
       "   'Distributed algorithm',\n",
       "   'Signal-free']},\n",
       " {'title': '自动驾驶环境下交叉口车辆路径规划与最优控制模型',\n",
       "  'creators': 'WuWei',\n",
       "  'abstractNote': '自动驾驶环境下的交叉口基于车车/车路之间的双向信息交互，\\xa0能保障自动驾驶车辆相互穿插与协作地通过交叉 口，\\xa0而无需信号灯控制.\\xa0因此，\\xa0如何设计高效的面向自动驾驶车辆通行的交叉口管控模型，\\xa0已成为研究的热点.\\xa0已有研究在 建模时，\\xa0均基于自动驾驶车辆在交叉口内部的行驶路径已知并作为模型输入，\\xa0且大多对交叉口内部的冲突点进行简化.\\xa0本文 首先将交叉口空间离散化处理，\\xa0考虑车辆的实际尺寸并面向非常规交叉口，\\xa0使用椭圆曲线建立转弯车辆行驶路径的精确轨 迹方程，\\xa0再通过外边界投影降维法建立轨迹方程和交叉口空间的映射关系.\\xa0建立了基于混合整数线性规划(Mixed\\xa0integer linear\\xa0programming，\\xa0MILP)的自动驾驶交叉口管控模型，\\xa0以交叉口总延误最小为控制目标，\\xa0同时优化车辆在交叉口的最 佳行驶路径和驶入时刻，\\xa0使用AMPL\\xa0(A\\xa0mathematical\\xa0programming\\xa0language)对模型进行编译并使用CPLEX求解器求 解.\\xa0与经典感应控制和先到先服务模型进行对比，\\xa0结果表明，\\xa0本文所提出模型能对车辆进入交叉口的时刻和行驶路径进行双 重优化，\\xa0显著降低自动驾驶车辆通过交叉口的车均延误，\\xa0提高交叉口空间的利用效率.',\n",
       "  'publicationTitle': '',\n",
       "  'date': '2020-9',\n",
       "  'language': '',\n",
       "  'url': '',\n",
       "  'libraryCatalog': '',\n",
       "  'tags': []},\n",
       " {'title': '学业预警知识图谱的构建与应用',\n",
       "  'creators': '闫瑾',\n",
       "  'abstractNote': '针对学业预警体系中“事前事中预防”措施不足、过程化与可视化较低的问题，进行了学业预警知识图谱的构建与应用。首先，通过protégé完成模式层的构建，对知识的数据结构(包含实体、关系、属性)进行了设计，采用了树状结构，使每个子类继承其祖先节点的属性；其次，以事实三元组为单位，存储具体的信息；然后，通过关系型数据库实现数据与本体的映射，对关系型数据库的结构化数据进行知识抽取，通过D2RQ工具将结构化数据转化为三元组数据，存储在SQL中；最后，使用Neo4j图数据库可视化展示，完成学业预警知识图谱的构建。公开数据集实验测试结果表明，所构建的学业预警知识图谱能够对学生学业进行预警，对实体及属性进行校验标注，经过采样标注后得到准确率为94.23%，且时效性良好，系统平均在9 ms后开始传输，并在25 ms后完成，同时在过程化与可视化方面有较大提升，可以实现“事前事中预防”。',\n",
       "  'publicationTitle': '中北大学学报(自然科学版)',\n",
       "  'date': '2023',\n",
       "  'language': '中文;',\n",
       "  'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50kyKUrPKilczQCToTb0b8hR90UZ_coCnRG5-mFTh0KJYq5nsDzG2GIu&uniplatform=NZKPT',\n",
       "  'libraryCatalog': 'CNKI',\n",
       "  'tags': ['Neo4j',\n",
       "   'Neo4j cademic early warning',\n",
       "   'data layer',\n",
       "   'knowledge graph',\n",
       "   'model layer',\n",
       "   '学业预警',\n",
       "   '数据层',\n",
       "   '模式层',\n",
       "   '知识图谱']},\n",
       " {'title': '《人工智能应用导论》知识图谱构建与应用研究',\n",
       "  'creators': '张军',\n",
       "  'abstractNote': '随着人工智能技术引发新一轮科技革命和社会进步，高职《人工智能应用导论》课程在众多院校相继开设，对其开展知识图谱的构建应用研究逐渐兴起，然而知识图谱的构建需要领域专家共同探讨、手工构建，十分严谨而繁琐，人工智能知识更新又较为快速，目前高职《人工智能应用导论》知识图谱的构建较为缺乏。本文借助机器学习和自然语言处理技术分别对《人工智能应用导论》课程文本资源进行实体识别和关系抽取，接着进行知识融合，最后基于Neo4j图数据库可视化展示并进行知识点推理。实验结果表明：构造后的课程知识图谱涵盖了《人工智能应用导论》课程所有知识点及其关系属性，助力开展知识点学习推理和学习路径推荐研究。',\n",
       "  'publicationTitle': '网络安全技术与应用',\n",
       "  'date': '2023',\n",
       "  'language': '中文;',\n",
       "  'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50ntGgPOL7QfrrIhj5ge6CBH4g2IsyPB8asXh8xKI3kodiPsmDcTH8Zw&uniplatform=NZKPT',\n",
       "  'libraryCatalog': 'CNKI',\n",
       "  'tags': ['人工智能应用导论课程', '应用研究', '推理实验', '知识图谱']},\n",
       " {'title': '时态知识图谱的推理研究综述',\n",
       "  'creators': '沈英汉',\n",
       "  'abstractNote': '随着社交网络、物端感知等技术快速发展，网络空间中涌现了大量的交互、话题、事件、新闻等数据，蕴含大量动态演化、强时效性的知识.较于忽略知识中时间信息的传统知识图谱，时态知识图谱通过建模知识的时效性以描述动态变化的现实世界，为时间紧耦合的应用提供有效支持.然而，时态知识图谱无法确保涵盖全量知识，知识的缺失严重影响应用性能，需要推理模型自动挖掘新的知识，以解释事物的历史状态，预测未来发展趋势并描述演化规律.由于实际应用的迫切需要，近年来，时态知识图谱的推理研究工作层出不穷，逐渐引起学术界和工业界的广泛关注.本文对近年来时态知识图谱的推理工作进行全面介绍和总结.首先，介绍了时态知识图谱的推理相关概念与问题描述;其次，介绍了面向补全任务的推理模型与面向预测任务的推理模型，对其进行比较分析;之后总结了时态知识图谱推理的数据集、推理任务、相关指标以及应用场景;最后展望时态知识图谱推理的未来研究趋势.综上，本文致力于为时态知识图谱的推理领域研究人员提供具有价值的参考，以推动该领域进一步发展.',\n",
       "  'publicationTitle': '计算机学报',\n",
       "  'date': '2023',\n",
       "  'language': '中文;',\n",
       "  'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50n2_FC1ezrNgNLgIgJzEpc4dzL6iDCe0g5IUw98rlIO1hnPzttTlaiP&uniplatform=NZKPT',\n",
       "  'libraryCatalog': 'CNKI',\n",
       "  'tags': ['knowledge completion',\n",
       "   'knowledge graph',\n",
       "   'knowledge prediction',\n",
       "   'temporal knowledge reasoning',\n",
       "   '时态知识图谱',\n",
       "   '时态知识推理',\n",
       "   '知识图谱 temporal knowledge graph',\n",
       "   '知识补全',\n",
       "   '知识预测']},\n",
       " {'title': '基于车路协同的城市应急车辆优先控制：概述与展望',\n",
       "  'creators': '张立立王力',\n",
       "  'abstractNote': '面向我国城市常态应急车辆优先通行需求和车路协同智能交通发展的实际情况，总结了应急车辆优先控制发展历程和研究现状，分析并讨论了存在的问题和未来的研究重点。首先，回顾了我国常态应急车辆优先的发展情况；其次，概述了车路协同应急车辆优先控制、结合优先与路径规划的应急车辆优先控制的国内外研究现状；最后，针对当前研究存在的问题进行了讨论并立足车路协同、自动驾驶等新理论与技术的演进总结了常态应急车辆优先领域的研究重点和应着力解决的关键问题。',\n",
       "  'publicationTitle': '科学技术与工程',\n",
       "  'date': '2021-12-16',\n",
       "  'language': 'cn',\n",
       "  'url': 'http://www.stae.com.cn//jsygc/article/abstract/2101973',\n",
       "  'libraryCatalog': 'www.stae.com.cn',\n",
       "  'tags': []},\n",
       " {'title': '基于知识图谱理念的生物医学电子学在线课程建设初步探索',\n",
       "  'creators': '涂华婷',\n",
       "  'abstractNote': '为满足疫情期间在线教育需求，基于知识图谱理念，从知识、问题、能力3个维度完善课程知识网络结构，初步探索生物医学电子学在线课程建设新模式，旨在重构教学资源、完善人才培养方案，实现开放、个性、精准的智慧教育体系创新。',\n",
       "  'publicationTitle': '卫生职业教育',\n",
       "  'date': '2023',\n",
       "  'language': '中文;',\n",
       "  'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50loVFp_9WcsUA-WIO5VPFBwqL4Z0qvbQZmVs1z40ce6xS5VghuFE8SX&uniplatform=NZKPT',\n",
       "  'libraryCatalog': 'CNKI',\n",
       "  'tags': ['Biomedical electronics',\n",
       "   'Online course',\n",
       "   '在线课程 Knowledge graph',\n",
       "   '生物医学电子学',\n",
       "   '知识图谱']},\n",
       " {'title': '基于博弈论的无信号交叉口冲突消解方法',\n",
       "  'creators': 'Unknown Author',\n",
       "  'abstractNote': '',\n",
       "  'publicationTitle': '重庆理工大学学报（自然科学）',\n",
       "  'date': '2021-11-09',\n",
       "  'language': 'cn',\n",
       "  'url': 'http://clgzk.qks.cqut.edu.cn/CN/abstract/abstract5742.shtml',\n",
       "  'libraryCatalog': 'clgzk.qks.cqut.edu.cn',\n",
       "  'tags': []},\n",
       " {'title': '地理知识图谱下的建筑群空间分布模式推理',\n",
       "  'creators': '唐曾杨',\n",
       "  'abstractNote': '以图结构表达的知识图谱不仅在语义网络的描述与推理中发挥着重要作用，对于空间实体的结构化抽象与空间推理也具有重要意义。空间实体的联系信息在知识图谱中以图的边记录，通过路径探测、子图对齐、模式发现等基于边的知识图谱计算推理，在空间场景认知可发挥重要作用。地理知识图谱是一种对地理概念、实体及其相互关系进行形式化描述的知识系统，既有通用知识的内涵与特点，也有地理知识特定的时空特征，能够将语义模型和时空模型联系起来，描述语义关系、空间关系和时间关系，在地理知识的表达、理解、获取与推理方面有巨大的应用潜力。现有地理知识图谱的研究工作多集中于语义方面，语义关系的抽取与表达比较丰富，可以支持进一步的地理知识语义搜索等功能；然而地理知识图谱在时空模型上的知识表达比较缺乏，现有的空间关系局限在要素之间，很少涉及空间认知中进一步的分布态势、空间格局等，地理知识图谱在空间语义知识方面有待增强。本文基于知识图谱构建原理，以建筑群地理知识图谱构建为例，实现格网型建筑物模式的识别。先将建筑物抽象成实体，表达为图的节点，基于几何邻近分析提取建筑物之间的空间邻域关系，以此构建建筑群地理知识图谱；在此基础上结合建筑物模式识别的领域知识，进一步推理构建其他的空间语义关系，完善地理知识图谱；再将建筑群场景的格网模式表达为知识图谱的规则，在知识图谱上基于NoSQL语言进行推理。结果表明，本文方法能有效提取建筑物格网模式，验证了地理知识图谱在空间推理上的作用和在领域问题研究中的良好适应性，为地理知识图谱在空间认知领域的应用提供了思路。',\n",
       "  'publicationTitle': '地球信息科学学报',\n",
       "  'date': '2023',\n",
       "  'language': '中文;',\n",
       "  'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50lqbdNo8z0wC2CMNOSuvHmHLVNnW2rbj7WrmcNNMwN0lMx5P0x-rTEa&uniplatform=NZKPT',\n",
       "  'libraryCatalog': 'CNKI',\n",
       "  'tags': ['building cluster pattern recognition',\n",
       "   'geographic entity',\n",
       "   'geographic knowledge',\n",
       "   'grid-pattern',\n",
       "   'spatial cognition',\n",
       "   'spatial reasoning',\n",
       "   'spatial relationship',\n",
       "   '地理实体',\n",
       "   '地理知识',\n",
       "   '建筑群模式识别',\n",
       "   '格网模式',\n",
       "   '知识图谱',\n",
       "   '空间关系',\n",
       "   '空间推理',\n",
       "   '空间认知 knowledge graph']},\n",
       " {'title': '车联网环境下基于间隙优化的无信号交叉口车速控制方法',\n",
       "  'creators': '常玉林１２',\n",
       "  'abstractNote': '',\n",
       "  'publicationTitle': '重庆理工大学学报（自然科学）',\n",
       "  'date': '2021-04-07',\n",
       "  'language': 'cn',\n",
       "  'url': 'http://clgzk.qks.cqut.edu.cn/CN/abstract/abstract5478.shtml',\n",
       "  'libraryCatalog': 'clgzk.qks.cqut.edu.cn',\n",
       "  'tags': []},\n",
       " {'title': '车路协同环境下路段掉头区域车辆协同控制',\n",
       "  'creators': '吴文静',\n",
       "  'abstractNote': '在车辆驾驶安全的前提下，以车速最大为目标研究了车路协同系统(CVIS)环境下掉头区域车辆的协同控制优化方法。分别设置单车连续掉头及车队掉头两种场景进行控制策略的仿真试验。结果表明，本文方法可实现掉头区域车辆的协同控制。最后，以总延误时间、驾驶舒适性为指标评估控制效率，并得到两种控制策略相适用的车头间距的阈值。',\n",
       "  'publicationTitle': '吉林大学学报(工学版)',\n",
       "  'date': '2019',\n",
       "  'language': 'zh-CN',\n",
       "  'url': 'https://chn.oversea.cnki.net/KCMS/detail/detail.aspx?dbcode=CJFD&dbname=CJFDLAST2019&filename=JLGY201904010&uniplatform=OVERSEA&v=Rqj5CTy_0_P9bEcOIEZeD28QjPsgNNgfmYsLUA32D4-pvDqu8Ihih0ZRsrFT_Lbd',\n",
       "  'libraryCatalog': 'CNKI',\n",
       "  'tags': ['交通运输系统工程', '协同控制', '路段掉头', '车队控制']},\n",
       " {'title': '车路协同下车队避让紧急车辆的换道引导方法',\n",
       "  'creators': '焦朋朋',\n",
       "  'abstractNote': '为保证紧急车辆更安全、高效地到达紧急事故现场，基于车路协同系统，提出车队避让紧急车辆的换道引导策略。针对目标车道无车辆、有车辆和有车队3种不同场景，分别提出确保紧急车辆快速通过的协同换道策略。通过协同换道策略引导紧急车辆前方行驶的车队和目标车道的车辆改变速度以调整车辆间距，使其满足换道安全距离，依据换道轨迹规划使车队完成换道，并提出紧急车辆发送紧急避让信号的位置方法，计算当不影响紧急车辆的速度情况下，其发送紧急避让信号时与车队尾车的最短距离。利用SUMO交通仿真软件，实现车路协同环境下3种不同场景车队避让紧急车辆的换道引导，并比较目标车道为车队的场景下，车队换道至目标车队的每个空档中(方式A)和车队换道至目标车队的同一个空档中(方式B)2种不同的换道引导策略。研究结果表明:目标车道有车队的场景下，方式B的协同换道时间更短，发送紧急信号的位置距车队尾车82 m，较方式A的87 m更近，对周围车辆影响更小，因此此场景采用方式B的协同换道策略;在目标车道无车辆、有车辆和有车队3种场景下，紧急车辆分别距车队尾车71，71，82 m时发送紧急避让信号，其可以维持期望速度，验证了最短距离与车辆速度的关系式;与未使用换道引导策略的情况相比，紧急车辆的速度提高，延误减少。',\n",
       "  'publicationTitle': '中国公路学报',\n",
       "  'date': '2021',\n",
       "  'language': 'zh-CN',\n",
       "  'url': 'https://kns.cnki.net/kcms2/article/abstract?v=Xlf5kQqXAOlKYazyf-ljzuHvhrTeVEbNl3DDX63-odz7HDdASJVAzcJwZJzvVUrEbgD6GyZfk9cS9ZzyxciZ46DMfT_AYFYK8jEUH9NlxflRam0Y7FqsY2p0QywF8Fczsa-06hHpxjDFjdjWaOxBW2ubTwM6KV6y&uniplatform=NZKPT&language=CHS',\n",
       "  'libraryCatalog': 'CNKI',\n",
       "  'tags': ['交通工程', '协同换道', '换道引导策略', '紧急避让', '车路协同', '车队编组']},\n",
       " {'title': '车路协同下避让紧急车辆协同换道策略',\n",
       "  'creators': '郝威',\n",
       "  'abstractNote': '为加快紧急车辆抵达事故现场的速度，同时减少紧急车辆优先权对其他车辆的影响，运用车路协同系统，提出避让紧急车辆协同换道策略，通过调整紧急车辆下游车辆位置，实现紧急车辆高效通过路段。以紧急车辆前车（DV）及其相邻目标车道车辆为控制对象，根据相邻车道车辆间距与车车通信范围，搜索DV可换道空间间隙集。以交通流整体恢复稳定时间最小为目标，确定DV换道轨迹和相邻车道协作车辆的速度变化，引导车辆完成协同合流，既能保障车辆安全换道，还能降低换道造成的速度振荡传递。同时，为快速恢复DV换道造成的目标车道车辆速度波动，对上游车辆（UV）采取先进先出规则的换道控制策略。所提协同避让紧急车辆的策略考虑了车辆协同换道对交通流的整体影响，并在原有换道策略的基础上提出了减少速度波动传递的控制方法。案例分析结果表明：采用上下游协同换道策略最短换道时间为6s，此时紧急车辆距前车78.66 m时发送避让信号。同时研究发现，恢复交通流速度稳定所需的时间为29 s，比未采用上下游协同换道策略降低了34%。',\n",
       "  'publicationTitle': '交通信息与安全',\n",
       "  'date': '2022',\n",
       "  'language': 'zh-CN',\n",
       "  'url': 'https://kns.cnki.net/kcms2/article/abstract?v=Xlf5kQqXAOnSvBuBHPaFIxyTinGrDINee5jkjVqvswSTlFcQsqwzOr0FFp9R8WmpH5l55BQ3Asxnuh6zFhhtBtXIZexTcE8t-TxouPdY6k2Vkk5NOC6HVs6XZrvx7C32GIX7M055z-wcXkX91S6Tv1MhuM3MkEXR&uniplatform=NZKPT&language=CHS',\n",
       "  'libraryCatalog': 'CNKI',\n",
       "  'tags': ['交通工程', '协同换道策略', '紧急避让', '车路协同']},\n",
       " {'title': '高校IT运维知识图谱构建及应用',\n",
       "  'creators': '李建青',\n",
       "  'abstractNote': '针对高校IT运维中出现的效率低下和服务不佳问题，文章研究设计了基于知识图谱的IT运维总体架构。通过分析运维对象、运维服务流程、运维数据、采集分析处理、运维管理等模块，构建了IT运维问答知识图谱，实现了运维场景和孤立知识点的业务应用关联，基于Neo4j和Python技术开发了IT运维问答系统，解决了运维效率低下和服务质量不佳的问题。',\n",
       "  'publicationTitle': '现代信息科技',\n",
       "  'date': '2023',\n",
       "  'language': '中文;',\n",
       "  'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50l8yRr-dwAdi5vhgvFP3RlDDHgKkYlPZYTXibJWzFBHw1euGu1vuenZ&uniplatform=NZKPT',\n",
       "  'libraryCatalog': 'CNKI',\n",
       "  'tags': ['IT operation and maintenance',\n",
       "   'IT运维',\n",
       "   'Q&A system',\n",
       "   '知识图谱',\n",
       "   '问答系统 knowledge graph']},\n",
       " {'title': '自动车环境下交叉口无信号混合控制策略研究',\n",
       "  'creators': 'Unknown Author',\n",
       "  'abstractNote': '针对交叉口拥挤且主路和支路流量有较大差异的自动车场景下，基于预约的先到先得（First Come First Serve， FCFS）控制策略效率低于信号配时策略的悖论，提出了一种结合FCFS和车队控制的交叉口无信号混合控制策略.在保证安全性的前提下，以延误最低为目标，优化车辆的通行顺序.引入启发式冲突协调算法和取消预约机制，根据车辆所在车道的车流量区分车辆的优先级，车辆根据实时车流量状况自适应地组成车队通过交叉口，从而减少FCFS策略产生的频繁通行权交换，保证车流量大方向车辆通行的连续性.仿真结果表明：在悖论场景下，混合控制策略较FCFS策略能减少55.84%的总延误.当交叉口总体车流量较大且主路与支路车流量差异较明显时，混合控制策略较FCFS策略在减少延误、提高交叉口通行能力方面的优势更明显.',\n",
       "  'publicationTitle': '北京交通大学学报',\n",
       "  'date': '2022/12/15',\n",
       "  'language': 'zh',\n",
       "  'url': 'https://jdxb.bjtu.edu.cn/CN/10.11860/j.issn.1673-0291.20210119',\n",
       "  'libraryCatalog': 'jdxb.bjtu.edu.cn',\n",
       "  'tags': []},\n",
       " {'title': 'A Hierarchical Robust Control Strategy for Decentralized Signal-Free Intersection Management',\n",
       "  'creators': 'PanXiao',\n",
       "  'abstractNote': 'The development of connected and automated vehicles (CAVs) is the key to improving urban mobility safety and efficiency. This article focuses on cooperative vehicle management at a signal-free intersection with consideration of vehicle modeling uncertainties and sensor measurement disturbances. The problem is approached by a hierarchical robust control strategy in a decentralized traffic coordination framework where optimal control and tube-based robust model predictive control methods are designed to hierarchically solve the optimal crossing order and the velocity trajectories of a group of CAVs in terms of energy consumption and throughput. To capture the energy consumption of each vehicle， their powertrain system is modeled in line with an electric drive system. With a suitable relaxation and spatial modeling approach， the optimization problems in the proposed strategy can be formulated as convex second-order cone programs， which provide a unique and computationally efficient solution. A rigorous proof of the equivalence between the convexified and the original problems is also provided. Simulation results illustrate the effectiveness and robustness of the proposed strategy and reveal the impact of traffic density on the control solution. The study of the Pareto optimal solutions for the energy–time objective shows that a minor reduction in journey time can considerably reduce energy consumption， which emphasizes the necessity of optimizing their tradeoff. Finally， the numerical comparisons carried out for different prediction horizons and sampling intervals provide insight into the control design.',\n",
       "  'publicationTitle': 'IEEE Transactions on Control Systems Technology',\n",
       "  'date': '2023-09',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/10186085',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Computational modeling',\n",
       "   'Connected and automated vehicles (CAVs)',\n",
       "   'Energy consumption',\n",
       "   'Mechanical power transmission',\n",
       "   'Optimization',\n",
       "   'Predictive models',\n",
       "   'Robust control',\n",
       "   'Uncertainty',\n",
       "   'convex formulation',\n",
       "   'cooperative vehicle management',\n",
       "   'optimization',\n",
       "   'tube-based robust model predictive control (MPC)']},\n",
       " {'title': '《山海经》知识图谱构建与应用研究',\n",
       "  'creators': '朱良兵',\n",
       "  'abstractNote': '[目的/意义]知识图谱作为数智时代一种先进的知识组织方式，能够为数字人文研究提供良好的技术支持，去洞察那些以往在文本资源中看不见的隐含联系和知识结构。[方法/过程]《山海经》是中国上古三大奇书，具有非常高的研究价值。本文引入主题图和Neo4j等技术和工具，在分析《山海经》中的主题类型、关联关系、属性信息的基础上，提出了构建《山海经》知识图谱的技术架构、数据模型、实施步骤。[结果/结论]将多源异构数据进行集成，完成了《山海经》知识图谱的构建和展示，对于将知识图谱技术应用于数字人文研究领域做了有益的探索。',\n",
       "  'publicationTitle': '情报探索',\n",
       "  'date': '2023',\n",
       "  'language': '中文;',\n",
       "  'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50niO_CdaBNPNX-0uaDVZU7N5AAnulcHnorXGWxEt2gB0eRbKuKlYbR5&uniplatform=NZKPT',\n",
       "  'libraryCatalog': 'CNKI',\n",
       "  'tags': ['association',\n",
       "   'digital humantiy',\n",
       "   'knowledge graph',\n",
       "   'semantic',\n",
       "   'topic map',\n",
       "   '主题图',\n",
       "   '关联',\n",
       "   '山海经',\n",
       "   '数字人文',\n",
       "   '知识图谱',\n",
       "   '语义 The Classic of Mountains and Seas']},\n",
       " {'title': '基于知识图谱的海洋数值预报数据推荐算法',\n",
       "  'creators': '李忠伟',\n",
       "  'abstractNote': '为解决海洋数值预报研究人员面对复杂多样的研究任务时难以及时准确地从种类繁多的海洋数值预报数据中找到所需数据的问题，提出基于知识图谱的海洋数值预报数据推荐算法。利用海洋数值预报文献提取研究任务及海洋数值预报数据构建知识图谱，基于知识图谱计算海洋数值预报数据实体之间的相似度，同时融合在研究人员用户行为下海洋数值预报数据的相似度，进行排序选取相似度较高的海洋数值预报数据进行推荐。实验结果表明，推荐精确率及召回率分别为67.14%、62.49%。',\n",
       "  'publicationTitle': '计算机工程与设计',\n",
       "  'date': '2023',\n",
       "  'language': '中文;',\n",
       "  'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50nW0ftToGaH65dirvJKW0nseKn702sm2gcw_U7v3Bi-WOR-4T8vghXQ&uniplatform=NZKPT',\n",
       "  'libraryCatalog': 'CNKI',\n",
       "  'tags': ['data recommendation',\n",
       "   'knowledge graph',\n",
       "   'marine numerical forecasting literature',\n",
       "   'research task',\n",
       "   'similarity',\n",
       "   'user behavior',\n",
       "   '数据推荐 marine numerical forecast',\n",
       "   '海洋数值预报',\n",
       "   '海洋数值预报文献',\n",
       "   '用户行为',\n",
       "   '相似度',\n",
       "   '知识图谱',\n",
       "   '研究任务']},\n",
       " {'title': 'Coordinated lane-changing scheduling of multilane CAV platoons in heterogeneous scenarios',\n",
       "  'creators': 'Unknown Author',\n",
       "  'abstractNote': 'With the development of sensing， communication and automated driving technology， connected and automated vehicles (CAVs) are becoming promising soluti…',\n",
       "  'publicationTitle': 'Transportation Research Part C: Emerging Technologies',\n",
       "  'date': '2023/02/01',\n",
       "  'language': 'en-US',\n",
       "  'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X22004053',\n",
       "  'libraryCatalog': 'www.sciencedirect.com',\n",
       "  'tags': []},\n",
       " {'title': 'Intelligent PV Module for Grid-Connected PV Systems',\n",
       "  'creators': 'RomanE.',\n",
       "  'abstractNote': 'Most issues carried out about building integrated photovoltaic (PV) system performance show average losses of about 20%-25% in electricity production. The causes are varied， e.g.， mismatching losses， partial shadows， variations in current-voltage (I-V) characteristics of PV modules due to manufacturing processes， differences in the orientations and inclinations of solar surfaces， and temperature effects. These losses can be decreased by means of suitable electronics. This paper presents the intelligent PV module concept， a low-cost high-efficiency dc-dc converter with maximum power point tracking (MPPT) functions， control， and power line communications (PLC). In addition， this paper analyses the alternatives for the architecture of grid-connected PV systems: centralized， string， and modular topologies. The proposed system， i.e.， the intelligent PV module， fits within this last group. Its principles of operation， as well as the topology of boost dc-dc converter， are analyzed. Besides， a comparison of MPPT methods is performed， which shows the best results for the incremental conductance method. Regarding communications， PLC in every PV module and its feasibility for grid-connected PV plants are considered and analyzed in this paper. After developing an intelligent PV module (with dc-dc converter) prototype， its optimal performance has been experimentally confirmed by means of the PV system test platform. This paper describes this powerful tool especially designed to evaluate all kinds of PV systems',\n",
       "  'publicationTitle': 'IEEE Transactions on Industrial Electronics',\n",
       "  'date': '2006-06',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/1667904',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Building integrated photovoltaics',\n",
       "   'Circuit topology',\n",
       "   'Communication system control',\n",
       "   'Communication system fault diagnosis',\n",
       "   'DC-DC power converters',\n",
       "   'Manufacturing processes',\n",
       "   'Power line communications',\n",
       "   'Production systems',\n",
       "   'Programmable control',\n",
       "   'System performance',\n",
       "   'Temperature',\n",
       "   'dc–dc power conversion',\n",
       "   'frequency-shift keying (FSK)',\n",
       "   'photovoltaic (PV) power systems',\n",
       "   'pulsewidth-modulated (PWM) power converters']}]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "journalArticle_data"
   ]
  },
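  {
   "cell_type": "markdown",
   "id": "3f6d2b1a-csv-export-note",
   "metadata": {},
   "source": [
    "A minimal sketch of exporting `journalArticle_data` to CSV, which is presumably why commas in the abstracts were replaced with full-width `，` above. The filename `journal_articles.csv` is an assumption, not something the notebook defines; note that `csv.DictWriter` quotes fields itself, so the comma replacement is belt-and-braces rather than strictly required."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c2a9e4d-csv-export-sketch",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "# Sketch: write one flat row per article. 'journal_articles.csv' is a hypothetical path.\n",
    "fieldnames = ['title', 'creators', 'abstractNote', 'publicationTitle', 'date',\n",
    "              'language', 'url', 'libraryCatalog', 'tags']\n",
    "with open('journal_articles.csv', 'w', newline='', encoding='utf-8-sig') as f:\n",
    "    writer = csv.DictWriter(f, fieldnames=fieldnames)\n",
    "    writer.writeheader()\n",
    "    for row in journalArticle_data:\n",
    "        # join the tag list into a single cell so the row stays flat\n",
    "        writer.writerow({**row, 'tags': '; '.join(row['tags'])})"
   ]
  },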
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ad7390bd-8188-45df-8f9a-a2a51045da67",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-25T14:19:31.301805Z",
     "start_time": "2025-07-25T14:19:31.280770Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 要保存的数据列表\n",
    "preprint_data = []\n",
    "for item in all_items:\n",
    "    # 只提取预印本文章，也就是论文的pdf\n",
    "    if 'itemType' in item['data'] and item['data']['itemType'] == 'preprint':\n",
    "        tags = []\n",
    "        for tag in item['data']['tags']:\n",
    "            tags.append(tag['tag'])\n",
    "        if(item['data']['creators']):\n",
    "            creator = item['data']['creators'][0]['lastName'] + item['data']['creators'][0]['firstName']\n",
    "        else:\n",
    "            creator = 'Unknown Author'\n",
    "        preprint_data.append({\n",
    "            'title': item['data']['title'],\n",
    "            'creators': item['data']['creators'][0]['lastName'] + item['data']['creators'][0]['firstName'],\n",
    "            'abstractNote': item['data']['abstractNote'].replace(',', '，'),\n",
    "            # 'publicationTitle': item['data']['publicationTitle'],\n",
    "            'date': item['data']['date'],\n",
    "            'language': item['data']['language'],\n",
    "            'url': item['data']['url'],\n",
    "            'libraryCatalog': item['data']['libraryCatalog'],\n",
    "            'tags': tags\n",
    "        })"
   ]
  },
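  {
   "cell_type": "markdown",
   "id": "9b4f7c2e-extract-helper-note",
   "metadata": {},
   "source": [
    "The journal-article and preprint loops differ only in the `itemType` filter and the per-type fields, so they could share one helper. Below is a minimal sketch under that assumption; `extract_by_type` is a name introduced here for illustration, not part of pyzotero."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e8d1f3c-extract-helper-sketch",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_by_type(items, item_type, extra_fields):\n",
    "    # Collect one flat dict per item of the given itemType (hypothetical helper).\n",
    "    results = []\n",
    "    for item in items:\n",
    "        data = item['data']\n",
    "        if data.get('itemType') != item_type:\n",
    "            continue\n",
    "        creators = data.get('creators') or []\n",
    "        creator = (creators[0].get('lastName', '') + creators[0].get('firstName', '')\n",
    "                   if creators else 'Unknown Author')\n",
    "        entry = {\n",
    "            'title': data.get('title', ''),\n",
    "            'creators': creator,\n",
    "            'abstractNote': data.get('abstractNote', '').replace(',', '，'),\n",
    "            'tags': [t['tag'] for t in data.get('tags', [])],\n",
    "        }\n",
    "        for field in extra_fields:  # per-type fields, e.g. preprints lack publicationTitle\n",
    "            entry[field] = data.get(field, '')\n",
    "        results.append(entry)\n",
    "    return results\n",
    "\n",
    "# Usage, equivalent to the loop above:\n",
    "# preprint_data = extract_by_type(all_items, 'preprint',\n",
    "#                                 ['date', 'language', 'url', 'libraryCatalog'])"
   ]
  },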
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1090276a9b86e71",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-25T14:19:31.672029Z",
     "start_time": "2025-07-25T14:19:31.638224Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'title': 'Denoising Diffusion Probabilistic Models',\n",
       "  'creators': 'HoJonathan',\n",
       "  'abstractNote': 'We present high quality image synthesis results using diffusion probabilistic models， a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics， and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset， we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN， we obtain sample quality similar to ProgressiveGAN. Our implementation is available at https://github.com/hojonathanho/diffusion',\n",
       "  'date': '2020-12-16',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2006.11239',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Machine Learning',\n",
       "   'Statistics - Machine Learning']},\n",
       " {'title': 'StyleStudio: Text-Driven Style Transfer with Selective Control of Style Elements',\n",
       "  'creators': 'LeiMingkun',\n",
       "  'abstractNote': 'Text-driven style transfer aims to merge the style of a reference image with content described by a text prompt. Recent advancements in text-to-image models have improved the nuance of style transformations， yet significant challenges remain， particularly with overfitting to reference styles， limiting stylistic control， and misaligning with textual content. In this paper， we propose three complementary strategies to address these issues. First， we introduce a cross-modal Adaptive Instance Normalization (AdaIN) mechanism for better integration of style and text features， enhancing alignment. Second， we develop a Style-based Classifier-Free Guidance (SCFG) approach that enables selective control over stylistic elements， reducing irrelevant influences. Finally， we incorporate a teacher model during early generation stages to stabilize spatial layouts and mitigate artifacts. Our extensive evaluations demonstrate significant improvements in style transfer quality and alignment with textual prompts. Furthermore， our approach can be integrated into existing style transfer frameworks without fine-tuning.',\n",
       "  'date': '2025-03-27',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2412.08503',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Efficient Diffusion as Low Light Enhancer',\n",
       "  'creators': 'LanGuanzhou',\n",
       "  'abstractNote': 'The computational burden of the iterative sampling process remains a major challenge in diffusion-based Low-Light Image Enhancement (LLIE). Current acceleration methods， whether training-based or training-free， often lead to significant performance degradation， highlighting the trade-off between performance and efficiency. In this paper， we identify two primary factors contributing to performance degradation: fitting errors and the inference gap. Our key insight is that fitting errors can be mitigated by linearly extrapolating the incorrect score functions， while the inference gap can be reduced by shifting the Gaussian flow to a reflectance-aware residual space. Based on the above insights， we design Reflectance-Aware Trajectory Refinement (RATR) module， a simple yet effective module to refine the teacher trajectory using the reflectance component of images. Following this， we introduce \\\\textbf{Re}flectance-aware \\\\textbf{D}iffusion with \\\\textbf{Di}stilled \\\\textbf{T}rajectory (\\\\textbf{ReDDiT})， an efficient and flexible distillation framework tailored for LLIE. Our framework achieves comparable performance to previous diffusion-based methods with redundant steps in just 2 steps while establishing new state-of-the-art (SOTA) results with 8 or 4 steps. Comprehensive experimental evaluations on 10 benchmark datasets validate the effectiveness of our method， consistently outperforming existing SOTA methods.',\n",
       "  'date': '2024-11-21',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2410.12346',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'HVI: A New Color Space for Low-light Image Enhancement',\n",
       "  'creators': 'YanQingsen',\n",
       "  'abstractNote': 'Low-Light Image Enhancement (LLIE) is a crucial computer vision task that aims to restore detailed visual information from corrupted low-light images. Many existing LLIE methods are based on standard RGB (sRGB) space， which often produce color bias and brightness artifacts due to inherent high color sensitivity in sRGB. While converting the images using Hue， Saturation and Value (HSV) color space helps resolve the brightness issue， it introduces significant red and black noise artifacts. To address this issue， we propose a new color space for LLIE， namely Horizontal/Vertical-Intensity (HVI)， defined by polarized HS maps and learnable intensity. The former enforces small distances for red coordinates to remove the red artifacts， while the latter compresses the low-light regions to remove the black artifacts. To fully leverage the chromatic and intensity information， a novel Color and Intensity Decoupling Network (CIDNet) is further introduced to learn accurate photometric mapping function under different lighting conditions in the HVI space. Comprehensive results from benchmark and ablation experiments show that the proposed HVI color space with CIDNet outperforms the state-of-the-art methods on 10 datasets. The code is available at https://github.com/Fediory/HVI-CIDNet.',\n",
       "  'date': '2025-02-28',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2502.20272',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Machine Learning']},\n",
       " {'title': 'MonSter: Marry Monodepth to Stereo Unleashes Power',\n",
       "  'creators': 'ChengJunda',\n",
       "  'abstractNote': 'Stereo matching recovers depth from image correspondences. Existing methods struggle to handle ill-posed regions with limited matching cues， such as occlusions and textureless areas. To address this， we propose MonSter， a novel method that leverages the complementary strengths of monocular depth estimation and stereo matching. MonSter integrates monocular depth and stereo matching into a dual-branch architecture to iteratively improve each other. Confidence-based guidance adaptively selects reliable stereo cues for monodepth scale-shift recovery. The refined monodepth is in turn guides stereo effectively at ill-posed regions. Such iterative mutual enhancement enables MonSter to evolve monodepth priors from coarse object-level structures to pixel-level geometry， fully unlocking the potential of stereo matching. As shown in Fig.1， MonSter ranks 1st across five most commonly used leaderboards -- SceneFlow， KITTI 2012， KITTI 2015， Middlebury， and ETH3D. Achieving up to 49.5% improvements (Bad 1.0 on ETH3D) over the previous best method. Comprehensive analysis verifies the effectiveness of MonSter in ill-posed regions. In terms of zero-shot generalization， MonSter significantly and consistently outperforms state-of-the-art across the board. The code is publicly available at: https://github.com/Junda24/MonSter.',\n",
       "  'date': '2025-01-15',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2501.08643',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'MonSter: Marry Monodepth to Stereo Unleashes Power',\n",
       "  'creators': 'ChengJunda',\n",
       "  'abstractNote': 'Stereo matching recovers depth from image correspondences. Existing methods struggle to handle ill-posed regions with limited matching cues， such as occlusions and textureless areas. To address this， we propose MonSter， a novel method that leverages the complementary strengths of monocular depth estimation and stereo matching. MonSter integrates monocular depth and stereo matching into a dual-branch architecture to iteratively improve each other. Confidence-based guidance adaptively selects reliable stereo cues for monodepth scale-shift recovery. The refined monodepth is in turn guides stereo effectively at ill-posed regions. Such iterative mutual enhancement enables MonSter to evolve monodepth priors from coarse object-level structures to pixel-level geometry， fully unlocking the potential of stereo matching. As shown in Fig.1， MonSter ranks 1st across five most commonly used leaderboards -- SceneFlow， KITTI 2012， KITTI 2015， Middlebury， and ETH3D. Achieving up to 49.5% improvements (Bad 1.0 on ETH3D) over the previous best method. Comprehensive analysis verifies the effectiveness of MonSter in ill-posed regions. In terms of zero-shot generalization， MonSter significantly and consistently outperforms state-of-the-art across the board. The code is publicly available at: https://github.com/Junda24/MonSter.',\n",
       "  'date': '2025-01-15',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2501.08643',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'DepthCrafter: Generating Consistent Long Depth Sequences for Open-world Videos',\n",
       "  'creators': 'HuWenbo',\n",
       "  'abstractNote': 'Estimating video depth in open-world scenarios is challenging due to the diversity of videos in appearance， content motion， camera movement， and length. We present DepthCrafter， an innovative method for generating temporally consistent long depth sequences with intricate details for open-world videos， without requiring any supplementary information such as camera poses or optical flow. The generalization ability to open-world videos is achieved by training the video-to-depth model from a pre-trained image-to-video diffusion model， through our meticulously designed three-stage training strategy. Our training approach enables the model to generate depth sequences with variable lengths at one time， up to 110 frames， and harvest both precise depth details and rich content diversity from realistic and synthetic datasets. We also propose an inference strategy that can process extremely long videos through segment-wise estimation and seamless stitching. Comprehensive evaluations on multiple datasets reveal that DepthCrafter achieves state-of-the-art performance in open-world video depth estimation under zero-shot settings. Furthermore， DepthCrafter facilitates various downstream applications， including depth-based visual effects and conditional video generation.',\n",
       "  'date': '2024-11-27',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2409.02095',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Graphics']},\n",
       " {'title': 'Universal Actions for Enhanced Embodied Foundation Models',\n",
       "  'creators': 'ZhengJinliang',\n",
       "  'abstractNote': 'Training on diverse， internet-scale data is a key factor in the success of recent large foundation models. Yet， using the same recipe for building embodied agents has faced noticeable difficulties. Despite the availability of many crowd-sourced embodied datasets， their action spaces often exhibit significant heterogeneity due to distinct physical embodiment and control interfaces for different robots， causing substantial challenges in developing embodied foundation models using cross-domain data. In this paper， we introduce UniAct， a new embodied foundation modeling framework operating in a Universal Action Space. Our learned universal actions capture the generic atomic behaviors across diverse robots by exploiting their shared structural features， and enable enhanced cross-domain data utilization and cross-embodiment generalizations by eliminating the notorious heterogeneity. The universal actions can be efficiently translated back to heterogeneous actionable commands by simply adding embodiment-specific details， from which fast adaptation to new robots becomes simple and straightforward. Our 0.5B instantiation of UniAct outperforms 14X larger SOTA embodied foundation models in extensive evaluations on various real-world and simulation robots， showcasing exceptional cross-embodiment control and adaptation capability， highlighting the crucial benefit of adopting universal actions. Project page: https://github.com/2toinf/UniAct',\n",
       "  'date': '2025-03-08',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2501.10105',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Robotics']},\n",
       " {'title': 'Number it: Temporal Grounding Videos like Flipping Manga',\n",
       "  'creators': 'WuYongliang',\n",
       "  'abstractNote': 'Video Large Language Models (Vid-LLMs) have made remarkable advancements in comprehending video content for QA dialogue. However， they struggle to extend this visual understanding to tasks requiring precise temporal localization， known as Video Temporal Grounding (VTG). To address this gap， we introduce Number-Prompt (NumPro)， a novel method that empowers Vid-LLMs to bridge visual comprehension with temporal grounding by adding unique numerical identifiers to each video frame. Treating a video as a sequence of numbered frame images， NumPro transforms VTG into an intuitive process: flipping through manga panels in sequence. This allows Vid-LLMs to \"read\" event timelines， accurately linking visual content with corresponding temporal information. Our experiments demonstrate that NumPro significantly boosts VTG performance of top-tier Vid-LLMs without additional computational cost. Furthermore， fine-tuning on a NumPro-enhanced dataset defines a new state-of-the-art for VTG， surpassing previous top-performing methods by up to 6.9\\\\% in mIoU for moment retrieval and 8.5\\\\% in mAP for highlight detection. The code will be available at https://github.com/yongliang-wu/NumPro.',\n",
       "  'date': '2025-03-21',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2411.10332',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'SemGeoMo: Dynamic Contextual Human Motion Generation with Semantic and Geometric Guidance',\n",
       "  'creators': 'CongPeishan',\n",
       "  'abstractNote': 'Generating reasonable and high-quality human interactive motions in a given dynamic environment is crucial for understanding， modeling， transferring， and applying human behaviors to both virtual and physical robots. In this paper， we introduce an effective method， SemGeoMo， for dynamic contextual human motion generation， which fully leverages the text-affordance-joint multi-level semantic and geometric guidance in the generation process， improving the semantic rationality and geometric correctness of generative motions. Our method achieves state-of-the-art performance on three datasets and demonstrates superior generalization capability for diverse interaction scenarios. The project page and code can be found at https://4dvlab.github.io/project_page/semgeomo/.',\n",
       "  'date': '2025-03-03',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2503.01291',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Fast3R: Towards 3D Reconstruction of 1000+ Images in One Forward Pass',\n",
       "  'creators': 'YangJianing',\n",
       "  'abstractNote': \"Multi-view 3D reconstruction remains a core challenge in computer vision， particularly in applications requiring accurate and scalable representations across diverse perspectives. Current leading methods such as DUSt3R employ a fundamentally pairwise approach， processing images in pairs and necessitating costly global alignment procedures to reconstruct from multiple views. In this work， we propose Fast 3D Reconstruction (Fast3R)， a novel multi-view generalization to DUSt3R that achieves efficient and scalable 3D reconstruction by processing many views in parallel. Fast3R's Transformer-based architecture forwards N images in a single forward pass， bypassing the need for iterative alignment. Through extensive experiments on camera pose estimation and 3D reconstruction， Fast3R demonstrates state-of-the-art performance， with significant improvements in inference speed and reduced error accumulation. These results establish Fast3R as a robust alternative for multi-view applications， offering enhanced scalability without compromising reconstruction accuracy.\",\n",
       "  'date': '2025-03-19',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2501.13928',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Graphics',\n",
       "   'Computer Science - Robotics']},\n",
       " {'title': 'StdGEN: Semantic-Decomposed 3D Character Generation from Single Images',\n",
       "  'creators': 'HeYuze',\n",
       "  'abstractNote': 'We present StdGEN， an innovative pipeline for generating semantically decomposed high-quality 3D characters from single images， enabling broad applications in virtual reality， gaming， and filmmaking， etc. Unlike previous methods which struggle with limited decomposability， unsatisfactory quality， and long optimization times， StdGEN features decomposability， effectiveness and efficiency; i.e.， it generates intricately detailed 3D characters with separated semantic components such as the body， clothes， and hair， in three minutes. At the core of StdGEN is our proposed Semantic-aware Large Reconstruction Model (S-LRM)， a transformer-based generalizable model that jointly reconstructs geometry， color and semantics from multi-view images in a feed-forward manner. A differentiable multi-layer semantic surface extraction scheme is introduced to acquire meshes from hybrid implicit fields reconstructed by our S-LRM. Additionally， a specialized efficient multi-view diffusion model and an iterative multi-layer surface refinement module are integrated into the pipeline to facilitate high-quality， decomposable 3D character generation. Extensive experiments demonstrate our state-of-the-art performance in 3D anime character generation， surpassing existing baselines by a significant margin in geometry， texture and decomposability. StdGEN offers ready-to-use semantic-decomposed 3D characters and enables flexible customization for a wide range of applications. Project page: https://stdgen.github.io',\n",
       "  'date': '2025-03-05',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2411.05738',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': \"h-Edit: Effective and Flexible Diffusion-Based Editing via Doob's h-Transform\",\n",
       "  'creators': 'NguyenToan',\n",
       "  'abstractNote': 'We introduce a theoretical framework for diffusion-based image editing by formulating it as a reverse-time bridge modeling problem. This approach modifies the backward process of a pretrained diffusion model to construct a bridge that converges to an implicit distribution associated with the editing target at time 0. Building on this framework， we propose h-Edit， a novel editing method that utilizes Doob\\'s h-transform and Langevin Monte Carlo to decompose the update of an intermediate edited sample into two components: a \"reconstruction\" term and an \"editing\" term. This decomposition provides flexibility， allowing the reconstruction term to be computed via existing inversion techniques and enabling the combination of multiple editing terms to handle complex editing tasks. To our knowledge， h-Edit is the first training-free method capable of performing simultaneous text-guided and reward-model-based editing. Extensive experiments， both quantitative and qualitative， show that h-Edit outperforms state-of-the-art baselines in terms of editing effectiveness and faithfulness. Our source code is available at https://github.com/nktoan/h-edit.',\n",
       "  'date': '2025-03-04',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2503.02187',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Generative Gaussian Splatting for Unbounded 3D City Generation',\n",
       "  'creators': 'XieHaozhe',\n",
       "  'abstractNote': '3D city generation with NeRF-based methods shows promising generation results but is computationally inefficient. Recently 3D Gaussian Splatting (3D-GS) has emerged as a highly efficient alternative for object-level 3D generation. However， adapting 3D-GS from finite-scale 3D objects and humans to infinite-scale 3D cities is non-trivial. Unbounded 3D city generation entails significant storage overhead (out-of-memory issues)， arising from the need to expand points to billions， often demanding hundreds of Gigabytes of VRAM for a city scene spanning 10km^2. In this paper， we propose GaussianCity， a generative Gaussian Splatting framework dedicated to efficiently synthesizing unbounded 3D cities with a single feed-forward pass. Our key insights are two-fold: 1) Compact 3D Scene Representation: We introduce BEV-Point as a highly compact intermediate representation， ensuring that the growth in VRAM usage for unbounded scenes remains constant， thus enabling unbounded city generation. 2) Spatial-aware Gaussian Attribute Decoder: We present spatial-aware BEV-Point decoder to produce 3D Gaussian attributes， which leverages Point Serializer to integrate the structural and contextual characteristics of BEV points. Extensive experiments demonstrate that GaussianCity achieves state-of-the-art results in both drone-view and street-view 3D city generation. Notably， compared to CityDreamer， GaussianCity exhibits superior performance with a speedup of 60 times (10.72 FPS v.s. 0.18 FPS).',\n",
       "  'date': '2025-02-27',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2406.06526',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Edit Away and My Face Will not Stay: Personal Biometric Defense against Malicious Generative Editing',\n",
       "  'creators': 'WangHanhui',\n",
       "  'abstractNote': 'Recent advancements in diffusion models have made generative image editing more accessible， enabling creative edits but raising ethical concerns， particularly regarding malicious edits to human portraits that threaten privacy and identity security. Existing protection methods primarily rely on adversarial perturbations to nullify edits but often fail against diverse editing requests. We propose FaceLock， a novel approach to portrait protection that optimizes adversarial perturbations to destroy or significantly alter biometric information， rendering edited outputs biometrically unrecognizable. FaceLock integrates facial recognition and visual perception into perturbation optimization to provide robust protection against various editing attempts. We also highlight flaws in commonly used evaluation metrics and reveal how they can be manipulated， emphasizing the need for reliable assessments of protection. Experiments show FaceLock outperforms baselines in defending against malicious edits and is robust against purification techniques. Ablation studies confirm its stability and broad applicability across diffusion-based editing algorithms. Our work advances biometric defense and sets the foundation for privacy-preserving practices in image editing. The code is available at: https://github.com/taco-group/FaceLock.',\n",
       "  'date': '2025-03-15',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2411.16832',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Machine Learning']},\n",
       " {'title': 'AR-Diffusion: Asynchronous Video Generation with Auto-Regressive Diffusion',\n",
       "  'creators': 'SunMingzhen',\n",
       "  'abstractNote': 'The task of video generation requires synthesizing visually realistic and temporally coherent video frames. Existing methods primarily use asynchronous auto-regressive models or synchronous diffusion models to address this challenge. However， asynchronous auto-regressive models often suffer from inconsistencies between training and inference， leading to issues such as error accumulation， while synchronous diffusion models are limited by their reliance on rigid sequence length. To address these issues， we introduce Auto-Regressive Diffusion (AR-Diffusion)， a novel model that combines the strengths of auto-regressive and diffusion models for flexible， asynchronous video generation. Specifically， our approach leverages diffusion to gradually corrupt video frames in both training and inference， reducing the discrepancy between these phases. Inspired by auto-regressive generation， we incorporate a non-decreasing constraint on the corruption timesteps of individual frames， ensuring that earlier frames remain clearer than subsequent ones. This setup， together with temporal causal attention， enables flexible generation of videos with varying lengths while preserving temporal coherence. In addition， we design two specialized timestep schedulers: the FoPP scheduler for balanced timestep sampling during training， and the AD scheduler for flexible timestep differences during inference， supporting both synchronous and asynchronous generation. Extensive experiments demonstrate the superiority of our proposed method， which achieves competitive and state-of-the-art results across four challenging benchmarks.',\n",
       "  'date': '2025-03-10',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2503.07418',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': \"Timestep Embedding Tells: It's Time to Cache for Video Diffusion Model\",\n",
       "  'creators': 'LiuFeng',\n",
       "  'abstractNote': 'As a fundamental backbone for video generation， diffusion models are challenged by low inference speed due to the sequential nature of denoising. Previous methods speed up the models by caching and reusing model outputs at uniformly selected timesteps. However， such a strategy neglects the fact that differences among model outputs are not uniform across timesteps， which hinders selecting the appropriate model outputs to cache， leading to a poor balance between inference efficiency and visual quality. In this study， we introduce Timestep Embedding Aware Cache (TeaCache)， a training-free caching approach that estimates and leverages the fluctuating differences among model outputs across timesteps. Rather than directly using the time-consuming model outputs， TeaCache focuses on model inputs， which have a strong correlation with the modeloutputs while incurring negligible computational cost. TeaCache first modulates the noisy inputs using the timestep embeddings to ensure their differences better approximating those of model outputs. TeaCache then introduces a rescaling strategy to refine the estimated differences and utilizes them to indicate output caching. Experiments show that TeaCache achieves up to 4.41x acceleration over Open-Sora-Plan with negligible (-0.07% Vbench score) degradation of visual quality.',\n",
       "  'date': '2025-03-18',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2411.19108',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'PhyT2V: LLM-Guided Iterative Self-Refinement for Physics-Grounded Text-to-Video Generation',\n",
       "  'creators': 'XueQiyao',\n",
       "  'abstractNote': \"Text-to-video (T2V) generation has been recently enabled by transformer-based diffusion models， but current T2V models lack capabilities in adhering to the real-world common knowledge and physical rules， due to their limited understanding of physical realism and deficiency in temporal modeling. Existing solutions are either data-driven or require extra model inputs， but cannot be generalizable to out-of-distribution domains. In this paper， we present PhyT2V， a new data-independent T2V technique that expands the current T2V model's capability of video generation to out-of-distribution domains， by enabling chain-of-thought and step-back reasoning in T2V prompting. Our experiments show that PhyT2V improves existing T2V models' adherence to real-world physical rules by 2.3x， and achieves 35% improvement compared to T2V prompt enhancers. The source codes are available at: https://github.com/pittisl/PhyT2V.\",\n",
       "  'date': '2025-04-01',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2412.00596',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'X-Dyna: Expressive Dynamic Human Image Animation',\n",
       "  'creators': 'ChangDi',\n",
       "  'abstractNote': 'We introduce X-Dyna， a novel zero-shot， diffusion-based pipeline for animating a single human image using facial expressions and body movements derived from a driving video， that generates realistic， context-aware dynamics for both the subject and the surrounding environment. Building on prior approaches centered on human pose control， X-Dyna addresses key shortcomings causing the loss of dynamic details， enhancing the lifelike qualities of human video animations. At the core of our approach is the Dynamics-Adapter， a lightweight module that effectively integrates reference appearance context into the spatial attentions of the diffusion backbone while preserving the capacity of motion modules in synthesizing fluid and intricate dynamic details. Beyond body pose control， we connect a local control module with our model to capture identity-disentangled facial expressions， facilitating accurate expression transfer for enhanced realism in animated scenes. Together， these components form a unified framework capable of learning physical human motion and natural scene dynamics from a diverse blend of human and scene videos. Comprehensive qualitative and quantitative evaluations demonstrate that X-Dyna outperforms state-of-the-art methods， creating highly lifelike and expressive animations. The code is available at https://github.com/bytedance/X-Dyna.',\n",
       "  'date': '2025-01-20',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2501.10021',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Cinemo: Consistent and Controllable Image Animation with Motion Diffusion Models',\n",
       "  'creators': 'MaXin',\n",
       "  'abstractNote': 'Diffusion models have achieved great progress in image animation due to powerful generative capabilities. However， maintaining spatio-temporal consistency with detailed information from the input static image over time (e.g.， style， background， and object of the input static image) and ensuring smoothness in animated video narratives guided by textual prompts still remains challenging. In this paper， we introduce Cinemo， a novel image animation approach towards achieving better motion controllability， as well as stronger temporal consistency and smoothness. In general， we propose three effective strategies at the training and inference stages of Cinemo to accomplish our goal. At the training stage， Cinemo focuses on learning the distribution of motion residuals， rather than directly predicting subsequent via a motion diffusion model. Additionally， a structural similarity index-based strategy is proposed to enable Cinemo to have better controllability of motion intensity. At the inference stage， a noise refinement technique based on discrete cosine transformation is introduced to mitigate sudden motion changes. Such three strategies enable Cinemo to produce highly consistent， smooth， and motion-controllable results. Compared to previous methods， Cinemo offers simpler and more precise user controllability. Extensive experiments against several state-of-the-art methods， including both commercial tools and research approaches， across multiple metrics， demonstrate the effectiveness and superiority of our proposed approach.',\n",
       "  'date': '2024-07-23',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2407.15642',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Identity-Preserving Text-to-Video Generation by Frequency Decomposition',\n",
       "  'creators': 'YuanShenghai',\n",
       "  'abstractNote': \"Identity-preserving text-to-video (IPT2V) generation aims to create high-fidelity videos with consistent human identity. It is an important task in video generation but remains an open problem for generative models. This paper pushes the technical frontier of IPT2V in two directions that have not been resolved in literature: (1) A tuning-free pipeline without tedious case-by-case finetuning， and (2) A frequency-aware heuristic identity-preserving DiT-based control scheme. We propose ConsisID， a tuning-free DiT-based controllable IPT2V model to keep human identity consistent in the generated video. Inspired by prior findings in frequency analysis of diffusion transformers， it employs identity-control signals in the frequency domain， where facial features can be decomposed into low-frequency global features and high-frequency intrinsic features. First， from a low-frequency perspective， we introduce a global facial extractor， which encodes reference images and facial key points into a latent space， generating features enriched with low-frequency information. These features are then integrated into shallow layers of the network to alleviate training challenges associated with DiT. Second， from a high-frequency perspective， we design a local facial extractor to capture high-frequency details and inject them into transformer blocks， enhancing the model's ability to preserve fine-grained features. We propose a hierarchical training strategy to leverage frequency information for identity preservation， transforming a vanilla pre-trained video generation model into an IPT2V model. Extensive experiments demonstrate that our frequency-aware heuristic scheme provides an optimal control solution for DiT-based models. Thanks to this scheme， our ConsisID generates high-quality， identity-preserving videos， making strides towards more effective IPT2V. Code: https://github.com/PKU-YuanGroup/ConsisID.\",\n",
       "  'date': '2025-03-25',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2411.17440',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Multimedia']},\n",
       " {'title': 'Generative Photography: Scene-Consistent Camera Control for Realistic Text-to-Image Synthesis',\n",
       "  'creators': 'YuanYu',\n",
       "  'abstractNote': 'Image generation today can produce somewhat realistic images from text prompts. However， if one asks the generator to synthesize a specific camera setting such as creating different fields of view using a 24mm lens versus a 70mm lens， the generator will not be able to interpret and generate scene-consistent images. This limitation not only hinders the adoption of generative tools in professional photography but also highlights the broader challenge of aligning data-driven models with real-world physical settings. In this paper， we introduce Generative Photography， a framework that allows controlling camera intrinsic settings during content generation. The core innovation of this work are the concepts of Dimensionality Lifting and Differential Camera Intrinsics Learning， enabling smooth and consistent transitions across different camera settings. Experimental results show that our method produces significantly more scene-consistent photorealistic images than state-of-the-art models such as Stable Diffusion 3 and FLUX. Our code and additional results are available at https://generative-photography.github.io/project.',\n",
       "  'date': '2025-03-25',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2412.02168',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Parallelized Autoregressive Visual Generation',\n",
       "  'creators': 'WangYuqing',\n",
       "  'abstractNote': 'Autoregressive models have emerged as a powerful approach for visual generation but suffer from slow inference speed due to their sequential token-by-token prediction process. In this paper， we propose a simple yet effective approach for parallelized autoregressive visual generation that improves generation efficiency while preserving the advantages of autoregressive modeling. Our key insight is that parallel generation depends on visual token dependencies-tokens with weak dependencies can be generated in parallel， while strongly dependent adjacent tokens are difficult to generate together， as their independent sampling may lead to inconsistencies. Based on this observation， we develop a parallel generation strategy that generates distant tokens with weak dependencies in parallel while maintaining sequential generation for strongly dependent local tokens. Our approach can be seamlessly integrated into standard autoregressive models without modifying the architecture or tokenizer. Experiments on ImageNet and UCF-101 demonstrate that our method achieves a 3.6x speedup with comparable quality and up to 9.5x speedup with minimal quality degradation across both image and video generation tasks. We hope this work will inspire future research in efficient visual generation and unified autoregressive modeling. Project page: https://yuqingwang1029.github.io/PAR-project.',\n",
       "  'date': '2025-04-03',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2412.15119',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'TokenFlow: Unified Image Tokenizer for Multimodal Understanding and Generation',\n",
       "  'creators': 'QuLiao',\n",
       "  'abstractNote': \"We present TokenFlow， a novel unified image tokenizer that bridges the long-standing gap between multimodal understanding and generation. Prior research attempt to employ a single reconstruction-targeted Vector Quantization (VQ) encoder for unifying these two tasks. We observe that understanding and generation require fundamentally different granularities of visual information. This leads to a critical trade-off， particularly compromising performance in multimodal understanding tasks. TokenFlow addresses this challenge through an innovative dual-codebook architecture that decouples semantic and pixel-level feature learning while maintaining their alignment via a shared mapping mechanism. This design enables direct access to both high-level semantic representations crucial for understanding tasks and fine-grained visual features essential for generation through shared indices. Our extensive experiments demonstrate TokenFlow's superiority across multiple dimensions. Leveraging TokenFlow， we demonstrate for the first time that discrete visual input can surpass LLaVA-1.5 13B in understanding performance， achieving a 7.2\\\\% average improvement. For image reconstruction， we achieve a strong FID score of 0.63 at 384*384 resolution. Moreover， TokenFlow establishes state-of-the-art performance in autoregressive image generation with a GenEval score of 0.55 at 256*256 resolution， achieving comparable results to SDXL.\",\n",
       "  'date': '2024-12-04',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2412.03069',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'SleeperMark: Towards Robust Watermark against Fine-Tuning Text-to-image Diffusion Models',\n",
       "  'creators': 'WangZilan',\n",
       "  'abstractNote': \"Recent advances in large-scale text-to-image (T2I) diffusion models have enabled a variety of downstream applications， including style customization， subject-driven personalization， and conditional generation. As T2I models require extensive data and computational resources for training， they constitute highly valued intellectual property (IP) for their legitimate owners， yet making them incentive targets for unauthorized fine-tuning by adversaries seeking to leverage these models for customized， usually profitable applications. Existing IP protection methods for diffusion models generally involve embedding watermark patterns and then verifying ownership through generated outputs examination， or inspecting the model's feature space. However， these techniques are inherently ineffective in practical scenarios when the watermarked model undergoes fine-tuning， and the feature space is inaccessible during verification ((i.e.， black-box setting). The model is prone to forgetting the previously learned watermark knowledge when it adapts to a new task. To address this challenge， we propose SleeperMark， a novel framework designed to embed resilient watermarks into T2I diffusion models. SleeperMark explicitly guides the model to disentangle the watermark information from the semantic concepts it learns， allowing the model to retain the embedded watermark while continuing to be adapted to new downstream tasks. Our extensive experiments demonstrate the effectiveness of SleeperMark across various types of diffusion models， including latent diffusion models (e.g.， Stable Diffusion) and pixel diffusion models (e.g.， DeepFloyd-IF)， showing robustness against downstream fine-tuning and various attacks at both the image and model levels， with minimal impact on the model's generative capability. The code is available at https://github.com/taco-group/SleeperMark.\",\n",
       "  'date': '2025-03-30',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2412.04852',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Reconstruction vs. Generation: Taming Optimization Dilemma in Latent Diffusion Models',\n",
       "  'creators': 'YaoJingfeng',\n",
       "  'abstractNote': 'Latent diffusion models with Transformer architectures excel at generating high-fidelity images. However， recent studies reveal an optimization dilemma in this two-stage design: while increasing the per-token feature dimension in visual tokenizers improves reconstruction quality， it requires substantially larger diffusion models and more training iterations to achieve comparable generation performance. Consequently， existing systems often settle for sub-optimal solutions， either producing visual artifacts due to information loss within tokenizers or failing to converge fully due to expensive computation costs. We argue that this dilemma stems from the inherent difficulty in learning unconstrained high-dimensional latent spaces. To address this， we propose aligning the latent space with pre-trained vision foundation models when training the visual tokenizers. Our proposed VA-VAE (Vision foundation model Aligned Variational AutoEncoder) significantly expands the reconstruction-generation frontier of latent diffusion models， enabling faster convergence of Diffusion Transformers (DiT) in high-dimensional latent spaces. To exploit the full potential of VA-VAE， we build an enhanced DiT baseline with improved training strategies and architecture designs， termed LightningDiT. The integrated system achieves state-of-the-art (SOTA) performance on ImageNet 256x256 generation with an FID score of 1.35 while demonstrating remarkable training efficiency by reaching an FID score of 2.11 in just 64 epochs--representing an over 21 times convergence speedup compared to the original DiT. Models and codes are available at: https://github.com/hustvl/LightningDiT.',\n",
       "  'date': '2025-03-10',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2501.01423',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Machine Learning']},\n",
       " {'title': 'Auto-Encoded Supervision for Perceptual Image Super-Resolution',\n",
       "  'creators': 'LeeMinKyu',\n",
       "  'abstractNote': 'This work tackles the fidelity objective in the perceptual super-resolution~(SR). Specifically， we address the shortcomings of pixel-level $L_\\\\text{p}$ loss ($\\\\mathcal{L}_\\\\text{pix}$) in the GAN-based SR framework. Since $L_\\\\text{pix}$ is known to have a trade-off relationship against perceptual quality， prior methods often multiply a small scale factor or utilize low-pass filters. However， this work shows that these circumventions fail to address the fundamental factor that induces blurring. Accordingly， we focus on two points: 1) precisely discriminating the subcomponent of $L_\\\\text{pix}$ that contributes to blurring， and 2) only guiding based on the factor that is free from this trade-off relationship. We show that they can be achieved in a surprisingly simple manner， with an Auto-Encoder (AE) pretrained with $L_\\\\text{pix}$. Accordingly， we propose the Auto-Encoded Supervision for Optimal Penalization loss ($L_\\\\text{AESOP}$)， a novel loss function that measures distance in the AE space， instead of the raw pixel space. Note that the AE space indicates the space after the decoder， not the bottleneck. By simply substituting $L_\\\\text{pix}$ with $L_\\\\text{AESOP}$， we can provide effective reconstruction guidance without compromising perceptual quality. Designed for simplicity， our method enables easy integration into existing SR frameworks. Experimental results verify that AESOP can lead to favorable results in the perceptual SR task.',\n",
       "  'date': '2025-04-11',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2412.00124',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Electrical Engineering and Systems Science - Image and Video Processing']},\n",
       " {'title': 'LiMoE: Mixture of LiDAR Representation Learners from Automotive Scenes',\n",
       "  'creators': 'XuXiang',\n",
       "  'abstractNote': 'LiDAR data pretraining offers a promising approach to leveraging large-scale， readily available datasets for enhanced data utilization. However， existing methods predominantly focus on sparse voxel representation， overlooking the complementary attributes provided by other LiDAR representations. In this work， we propose LiMoE， a framework that integrates the Mixture of Experts (MoE) paradigm into LiDAR data representation learning to synergistically combine multiple representations， such as range images， sparse voxels， and raw points. Our approach consists of three stages: i) Image-to-LiDAR Pretraining， which transfers prior knowledge from images to point clouds across different representations; ii) Contrastive Mixture Learning (CML)， which uses MoE to adaptively activate relevant attributes from each representation and distills these mixed features into a unified 3D network; iii) Semantic Mixture Supervision (SMS)， which combines semantic logits from multiple representations to boost downstream segmentation performance. Extensive experiments across eleven large-scale LiDAR datasets demonstrate our effectiveness and superiority. The code has been made publicly accessible.',\n",
       "  'date': '2025-03-20',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2501.04004',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Machine Learning',\n",
       "   'Computer Science - Robotics']},\n",
       " {'title': 'Omnidirectional Multi-Object Tracking',\n",
       "  'creators': 'LuoKai',\n",
       "  'abstractNote': 'Panoramic imagery， with its 360{\\\\deg} field of view， offers comprehensive information to support Multi-Object Tracking (MOT) in capturing spatial and temporal relationships of surrounding objects. However， most MOT algorithms are tailored for pinhole images with limited views， impairing their effectiveness in panoramic settings. Additionally， panoramic image distortions， such as resolution loss， geometric deformation， and uneven lighting， hinder direct adaptation of existing MOT methods， leading to significant performance degradation. To address these challenges， we propose OmniTrack， an omnidirectional MOT framework that incorporates Tracklet Management to introduce temporal cues， FlexiTrack Instances for object localization and association， and the CircularStatE Module to alleviate image and geometric distortions. This integration enables tracking in panoramic field-of-view scenarios， even under rapid sensor motion. To mitigate the lack of panoramic MOT datasets， we introduce the QuadTrack dataset--a comprehensive panoramic dataset collected by a quadruped robot， featuring diverse challenges such as panoramic fields of view， intense motion， and complex environments. Extensive experiments on the public JRDB dataset and the newly introduced QuadTrack benchmark demonstrate the state-of-the-art performance of the proposed framework. OmniTrack achieves a HOTA score of 26.92% on JRDB， representing an improvement of 3.43%， and further achieves 23.45% on QuadTrack， surpassing the baseline by 6.81%. The established dataset and source code are available at https://github.com/xifen523/OmniTrack.',\n",
       "  'date': '2025-03-23',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2503.04565',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Robotics',\n",
       "   'Electrical Engineering and Systems Science - Image and Video Processing']},\n",
       " {'title': 'Multiple Object Tracking as ID Prediction',\n",
       "  'creators': 'GaoRuopeng',\n",
       "  'abstractNote': \"Multi-Object Tracking (MOT) has been a long-standing challenge in video understanding. A natural and intuitive approach is to split this task into two parts: object detection and association. Most mainstream methods employ meticulously crafted heuristic techniques to maintain trajectory information and compute cost matrices for object matching. Although these methods can achieve notable tracking performance， they often require a series of elaborate handcrafted modifications while facing complicated scenarios. We believe that manually assumed priors limit the method's adaptability and flexibility in learning optimal tracking capabilities from domain-specific data. Therefore， we introduce a new perspective that treats Multiple Object Tracking as an in-context ID Prediction task， transforming the aforementioned object association into an end-to-end trainable task. Based on this， we propose a simple yet effective method termed MOTIP. Given a set of trajectories carried with ID information， MOTIP directly decodes the ID labels for current detections to accomplish the association process. Without using tailored or sophisticated architectures， our method achieves state-of-the-art results across multiple benchmarks by solely leveraging object-level features as tracking cues. The simplicity and impressive results of MOTIP leave substantial room for future advancements， thereby making it a promising baseline for subsequent research. Our code and checkpoints are released at https://github.com/MCG-NJU/MOTIP.\",\n",
       "  'date': '2025-03-24',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2403.16848',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'LLMDet: Learning Strong Open-Vocabulary Object Detectors under the Supervision of Large Language Models',\n",
       "  'creators': 'FuShenghao',\n",
       "  'abstractNote': 'Recent open-vocabulary detectors achieve promising performance with abundant region-level annotated data. In this work， we show that an open-vocabulary detector co-training with a large language model by generating image-level detailed captions for each image can further improve performance. To achieve the goal， we first collect a dataset， GroundingCap-1M， wherein each image is accompanied by associated grounding labels and an image-level detailed caption. With this dataset， we finetune an open-vocabulary detector with training objectives including a standard grounding loss and a caption generation loss. We take advantage of a large language model to generate both region-level short captions for each region of interest and image-level long captions for the whole image. Under the supervision of the large language model， the resulting detector， LLMDet， outperforms the baseline by a clear margin， enjoying superior open-vocabulary ability. Further， we show that the improved LLMDet can in turn build a stronger large multi-modal model， achieving mutual benefits. The code， model， and dataset is available at https://github.com/iSEE-Laboratory/LLMDet.',\n",
       "  'date': '2025-01-31',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2501.18954',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'NLPrompt: Noise-Label Prompt Learning for Vision-Language Models',\n",
       "  'creators': 'PanBikang',\n",
       "  'abstractNote': 'The emergence of vision-language foundation models， such as CLIP， has revolutionized image-text representation， enabling a broad range of applications via prompt learning. Despite its promise， real-world datasets often contain noisy labels that can degrade prompt learning performance. In this paper， we demonstrate that using mean absolute error (MAE) loss in prompt learning， named PromptMAE， significantly enhances robustness against noisy labels while maintaining high accuracy. Though MAE is straightforward and recognized for its robustness， it is rarely used in noisy-label learning due to its slow convergence and poor performance outside prompt learning scenarios. To elucidate the robustness of PromptMAE， we leverage feature learning theory to show that MAE can suppress the influence of noisy samples， thereby improving the signal-to-noise ratio and enhancing overall robustness. Additionally， we introduce PromptOT， a prompt-based optimal transport data purification method to enhance the robustness further. PromptOT employs text features in vision-language models as prototypes to construct an optimal transportation matrix. This matrix effectively partitions datasets into clean and noisy subsets， allowing for the application of cross-entropy loss to the clean subset and MAE loss to the noisy subset. Our Noise-Label Prompt Learning method， named NLPrompt， offers a simple and efficient approach that leverages the expressive representations and precise alignment capabilities of vision-language models for robust prompt learning. We validate NLPrompt through extensive experiments across various noise settings， demonstrating significant performance improvements.',\n",
       "  'date': '2025-03-26',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2412.01256',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Machine Learning']},\n",
       " {'title': 'RAP: Retrieval-Augmented Personalization for Multimodal Large Language Models',\n",
       "  'creators': 'HaoHaoran',\n",
       "  'abstractNote': \"The development of large language models (LLMs) has significantly enhanced the capabilities of multimodal LLMs (MLLMs) as general assistants. However， lack of user-specific knowledge still restricts their application in human's daily life. In this paper， we introduce the Retrieval Augmented Personalization (RAP) framework for MLLMs' personalization. Starting from a general MLLM， we turn it into a personalized assistant in three steps. (a) Remember: We design a key-value database to store user-related information， e.g.， user's name， avatar and other attributes. (b) Retrieve: When the user initiates a conversation， RAP will retrieve relevant information from the database using a multimodal retriever. (c) Generate: The input query and retrieved concepts' information are fed into MLLMs to generate personalized， knowledge-augmented responses. Unlike previous methods， RAP allows real-time concept editing via updating the external database. To further improve generation quality and alignment with user-specific information， we design a pipeline for data collection and create a specialized dataset for personalized training of MLLMs. Based on the dataset， we train a series of MLLMs as personalized multimodal assistants. By pretraining on large-scale dataset， RAP-MLLMs can generalize to infinite visual concepts without additional finetuning. Our models demonstrate outstanding flexibility and generation quality across a variety of tasks， such as personalized image captioning， question answering and visual recognition. The code， data and models are available at https://hoar012.github.io/RAP-Project/.\",\n",
       "  'date': '2025-03-28',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2410.13360',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computation and Language',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Machine Learning',\n",
       "   'Computer Science - Multimedia']},\n",
       " {'title': 'DynRefer: Delving into Region-level Multimodal Tasks via Dynamic Resolution',\n",
       "  'creators': 'ZhaoYuzhong',\n",
       "  'abstractNote': 'One fundamental task of multimodal models is to translate referred image regions to human preferred language descriptions. Existing methods， however， ignore the resolution adaptability needs of different tasks， which hinders them to find out precise language descriptions. In this study， we propose a DynRefer approach， to pursue high-accuracy region-level referring through mimicking the resolution adaptability of human visual cognition. During training， DynRefer stochastically aligns language descriptions of multimodal tasks with images of multiple resolutions， which are constructed by nesting a set of random views around the referred region. During inference， DynRefer performs selectively multimodal referring by sampling proper region representations for tasks from the nested views based on image and task priors. This allows the visual information for referring to better match human preferences， thereby improving the representational adaptability of region-level multimodal models. Experiments show that DynRefer brings mutual improvement upon broad tasks including region-level captioning， open-vocabulary region recognition and attribute detection. Furthermore， DynRefer achieves state-of-the-art results on multiple region-level multimodal tasks using a single model. Code is available at https://github.com/callsys/DynRefer.',\n",
       "  'date': '2025-03-02',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2405.16071',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'LSceneLLM: Enhancing Large 3D Scene Understanding Using Adaptive Visual Preferences',\n",
       "  'creators': 'ZhiHongyan',\n",
       "  'abstractNote': \"Research on 3D Vision-Language Models (3D-VLMs) is gaining increasing attention， which is crucial for developing embodied AI within 3D scenes， such as visual navigation and embodied question answering. Due to the high density of visual features， especially in large 3D scenes， accurately locating task-relevant visual information is challenging. Existing works attempt to segment all objects and consider their features as scene representations. However， these task-agnostic object features include much redundant information and missing details for the task-relevant area. To tackle these problems， we propose LSceneLLM， an adaptive framework that automatically identifies task-relevant areas by leveraging LLM's visual preference for different tasks， followed by a plug-and-play scene magnifier module to capture fine-grained details in focused areas. Specifically， a dense token selector examines the attention map of LLM to identify visual preferences for the instruction input. It then magnifies fine-grained details of the focusing area. An adaptive self-attention module is leveraged to fuse the coarse-grained and selected fine-grained visual information. To comprehensively evaluate the large scene understanding ability of 3D-VLMs， we further introduce a cross-room understanding benchmark， XR-Scene， which contains a series of large scene understanding tasks including XR-QA， XR-EmbodiedPlanning， and XR-SceneCaption. Experiments show that our method surpasses existing methods on both large scene understanding and existing scene understanding benchmarks. Plunging our scene magnifier module into the existing 3D-VLMs also brings significant improvement.\",\n",
       "  'date': '2025-02-02',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2412.01292',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'CityWalker: Learning Embodied Urban Navigation from Web-Scale Videos',\n",
       "  'creators': 'LiuXinhao',\n",
       "  'abstractNote': 'Navigating dynamic urban environments presents significant challenges for embodied agents， requiring advanced spatial reasoning and adherence to common-sense norms. Despite progress， existing visual navigation methods struggle in map-free or off-street settings， limiting the deployment of autonomous agents like last-mile delivery robots. To overcome these obstacles， we propose a scalable， data-driven approach for human-like urban navigation by training agents on thousands of hours of in-the-wild city walking and driving videos sourced from the web. We introduce a simple and scalable data processing pipeline that extracts action supervision from these videos， enabling large-scale imitation learning without costly annotations. Our model learns sophisticated navigation policies to handle diverse challenges and critical scenarios. Experimental results show that training on large-scale， diverse datasets significantly enhances navigation performance， surpassing current methods. This work shows the potential of using abundant online video data to develop robust navigation policies for embodied agents in dynamic urban settings. Project homepage is at https://ai4ce.github.io/CityWalker/.',\n",
       "  'date': '2025-04-22',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2411.17820',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Robotics']},\n",
       " {'title': 'MambaVision: A Hybrid Mamba-Transformer Vision Backbone',\n",
       "  'creators': 'HatamizadehAli',\n",
       "  'abstractNote': 'We propose a novel hybrid Mamba-Transformer backbone， MambaVision， specifically tailored for vision applications. Our core contribution includes redesigning the Mamba formulation to enhance its capability for efficient modeling of visual features. Through a comprehensive ablation study， we demonstrate the feasibility of integrating Vision Transformers (ViT) with Mamba. Our results show that equipping the Mamba architecture with self-attention blocks in the final layers greatly improves its capacity to capture long-range spatial dependencies. Based on these findings， we introduce a family of MambaVision models with a hierarchical architecture to meet various design criteria. For classification on the ImageNet-1K dataset， MambaVision variants achieve state-of-the-art (SOTA) performance in terms of both Top-1 accuracy and throughput. In downstream tasks such as object detection， instance segmentation， and semantic segmentation on MS COCO and ADE20K datasets， MambaVision outperforms comparably sized backbones while demonstrating favorable performance. Code: https://github.com/NVlabs/MambaVision',\n",
       "  'date': '2025-03-25',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2407.08083',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Rethinking Transformer-Based Blind-Spot Network for Self-Supervised Image Denoising',\n",
       "  'creators': 'LiJunyi',\n",
       "  'abstractNote': 'Blind-spot networks (BSN) have been prevalent neural architectures in self-supervised image denoising (SSID). However， most existing BSNs are conducted with convolution layers. Although transformers have shown the potential to overcome the limitations of convolutions in many image restoration tasks， the attention mechanisms may violate the blind-spot requirement， thereby restricting their applicability in BSN. To this end， we propose to analyze and redesign the channel and spatial attentions to meet the blind-spot requirement. Specifically， channel self-attention may leak the blind-spot information in multi-scale architectures， since the downsampling shuffles the spatial feature into channel dimensions. To alleviate this problem， we divide the channel into several groups and perform channel attention separately. For spatial selfattention， we apply an elaborate mask to the attention matrix to restrict and mimic the receptive field of dilated convolution. Based on the redesigned channel and window attentions， we build a Transformer-based Blind-Spot Network (TBSN)， which shows strong local fitting and global perspective abilities. Furthermore， we introduce a knowledge distillation strategy that distills TBSN into smaller denoisers to improve computational efficiency while maintaining performance. Extensive experiments on real-world image denoising datasets show that TBSN largely extends the receptive field and exhibits favorable performance against state-of-theart SSID methods.',\n",
       "  'date': '2024-12-17',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2404.07846',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Electrical Engineering and Systems Science - Image and Video Processing']},\n",
       " {'title': 'RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer',\n",
       "  'creators': 'LvWenyu',\n",
       "  'abstractNote': 'In this report， we present RT-DETRv2， an improved Real-Time DEtection TRansformer (RT-DETR). RT-DETRv2 builds upon the previous state-of-the-art real-time detector， RT-DETR， and opens up a set of bag-of-freebies for flexibility and practicality， as well as optimizing the training strategy to achieve enhanced performance. To improve the flexibility， we suggest setting a distinct number of sampling points for features at different scales in the deformable attention to achieve selective multi-scale feature extraction by the decoder. To enhance practicality， we propose an optional discrete sampling operator to replace the grid_sample operator that is specific to RT-DETR compared to YOLOs. This removes the deployment constraints typically associated with DETRs. For the training strategy， we propose dynamic data augmentation and scale-adaptive hyperparameters customization to improve performance without loss of speed. Source code and pre-trained models will be available at https://github.com/lyuwenyu/RT-DETR.',\n",
       "  'date': '2024-07-24',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2407.17140',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'U-Mamba: Enhancing Long-range Dependency for Biomedical Image Segmentation',\n",
       "  'creators': 'MaJun',\n",
       "  'abstractNote': 'Convolutional Neural Networks (CNNs) and Transformers have been the most popular architectures for biomedical image segmentation， but both of them have limited ability to handle long-range dependencies because of inherent locality or computational complexity. To address this challenge， we introduce U-Mamba， a general-purpose network for biomedical image segmentation. Inspired by the State Space Sequence Models (SSMs)， a new family of deep sequence models known for their strong capability in handling long sequences， we design a hybrid CNN-SSM block that integrates the local feature extraction power of convolutional layers with the abilities of SSMs for capturing the long-range dependency. Moreover， U-Mamba enjoys a self-configuring mechanism， allowing it to automatically adapt to various datasets without manual intervention. We conduct extensive experiments on four diverse tasks， including the 3D abdominal organ segmentation in CT and MR images， instrument segmentation in endoscopy images， and cell segmentation in microscopy images. The results reveal that U-Mamba outperforms state-of-the-art CNN-based and Transformer-based segmentation networks across all tasks. This opens new avenues for efficient long-range dependency modeling in biomedical image analysis. The code， models， and data are publicly available at https://wanglab.ai/u-mamba.html.',\n",
       "  'date': '2024-01-09',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2401.04722',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Machine Learning',\n",
       "   'Electrical Engineering and Systems Science - Image and Video Processing']},\n",
       " {'title': 'Fusion-Mamba for Cross-modality Object Detection',\n",
       "  'creators': 'DongWenhao',\n",
       "  'abstractNote': 'Cross-modality fusing complementary information from different modalities effectively improves object detection performance， making it more useful and robust for a wider range of applications. Existing fusion strategies combine different types of images or merge different backbone features through elaborated neural network modules. However， these methods neglect that modality disparities affect cross-modality fusion performance， as different modalities with different camera focal lengths， placements， and angles are hardly fused. In this paper， we investigate cross-modality fusion by associating cross-modal features in a hidden state space based on an improved Mamba with a gating mechanism. We design a Fusion-Mamba block (FMB) to map cross-modal features into a hidden state space for interaction， thereby reducing disparities between cross-modal features and enhancing the representation consistency of fused features. FMB contains two modules: the State Space Channel Swapping (SSCS) module facilitates shallow feature fusion， and the Dual State Space Fusion (DSSF) enables deep fusion in a hidden state space. Through extensive experiments on public datasets， our proposed approach outperforms the state-of-the-art methods on $m$AP with 5.9% on $M^3FD$ and 4.9% on FLIR-Aligned datasets， demonstrating superior object detection performance. To the best of our knowledge， this is the first work to explore the potential of Mamba for cross-modal fusion and establish a new baseline for cross-modality object detection.',\n",
       "  'date': '2024-04-14',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2404.09146',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'DEIM: DETR with Improved Matching for Fast Convergence',\n",
       "  'creators': 'HuangShihua',\n",
       "  'abstractNote': 'We introduce DEIM， an innovative and efficient training framework designed to accelerate convergence in real-time object detection with Transformer-based architectures (DETR). To mitigate the sparse supervision inherent in one-to-one (O2O) matching in DETR models， DEIM employs a Dense O2O matching strategy. This approach increases the number of positive samples per image by incorporating additional targets， using standard data augmentation techniques. While Dense O2O matching speeds up convergence， it also introduces numerous low-quality matches that could affect performance. To address this， we propose the Matchability-Aware Loss (MAL)， a novel loss function that optimizes matches across various quality levels， enhancing the effectiveness of Dense O2O. Extensive experiments on the COCO dataset validate the efficacy of DEIM. When integrated with RT-DETR and D-FINE， it consistently boosts performance while reducing training time by 50%. Notably， paired with RT-DETRv2， DEIM achieves 53.2% AP in a single day of training on an NVIDIA 4090 GPU. Additionally， DEIM-trained real-time models outperform leading real-time object detectors， with DEIM-D-FINE-L and DEIM-D-FINE-X achieving 54.7% and 56.5% AP at 124 and 78 FPS on an NVIDIA T4 GPU， respectively， without the need for additional data. We believe DEIM sets a new baseline for advancements in real-time object detection. Our code and pre-trained models are available at https://github.com/ShihuaHuang95/DEIM.',\n",
       "  'date': '2025-03-26',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2412.04234',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'D-FINE: Redefine Regression Task in DETRs as Fine-grained Distribution Refinement',\n",
       "  'creators': 'PengYansong',\n",
       "  'abstractNote': 'We introduce D-FINE， a powerful real-time object detector that achieves outstanding localization precision by redefining the bounding box regression task in DETR models. D-FINE comprises two key components: Fine-grained Distribution Refinement (FDR) and Global Optimal Localization Self-Distillation (GO-LSD). FDR transforms the regression process from predicting fixed coordinates to iteratively refining probability distributions， providing a fine-grained intermediate representation that significantly enhances localization accuracy. GO-LSD is a bidirectional optimization strategy that transfers localization knowledge from refined distributions to shallower layers through self-distillation， while also simplifying the residual prediction tasks for deeper layers. Additionally， D-FINE incorporates lightweight optimizations in computationally intensive modules and operations， achieving a better balance between speed and accuracy. Specifically， D-FINE-L / X achieves 54.0% / 55.8% AP on the COCO dataset at 124 / 78 FPS on an NVIDIA T4 GPU. When pretrained on Objects365， D-FINE-L / X attains 57.1% / 59.3% AP， surpassing all existing real-time detectors. Furthermore， our method significantly enhances the performance of a wide range of DETR models by up to 5.3% AP with negligible extra parameters and training costs. Our code and pretrained models: https://github.com/Peterande/D-FINE.',\n",
       "  'date': '2024-10-17',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2410.13842',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'No More Strided Convolutions or Pooling: A New CNN Building Block for Low-Resolution Images and Small Objects',\n",
       "  'creators': 'SunkaraRaja',\n",
       "  'abstractNote': 'Convolutional neural networks (CNNs) have made resounding success in many computer vision tasks such as image classification and object detection. However， their performance degrades rapidly on tougher tasks where images are of low resolution or objects are small. In this paper， we point out that this roots in a defective yet common design in existing CNN architectures， namely the use of strided convolution and/or pooling layers， which results in a loss of fine-grained information and learning of less effective feature representations. To this end， we propose a new CNN building block called SPD-Conv in place of each strided convolution layer and each pooling layer (thus eliminates them altogether). SPD-Conv is comprised of a space-to-depth (SPD) layer followed by a non-strided convolution (Conv) layer， and can be applied in most if not all CNN architectures. We explain this new design under two most representative computer vision tasks: object detection and image classification. We then create new CNN architectures by applying SPD-Conv to YOLOv5 and ResNet， and empirically show that our approach significantly outperforms state-of-the-art deep learning models， especially on tougher tasks with low-resolution images and small objects. We have open-sourced our code at https://github.com/LabSAINT/SPD-Conv.',\n",
       "  'date': '2022-08-07',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2208.03641',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Machine Learning']},\n",
       " {'title': 'MobileMamba: Lightweight Multi-Receptive Visual Mamba Network',\n",
       "  'creators': 'HeHaoyang',\n",
       "  'abstractNote': 'Previous research on lightweight models has primarily focused on CNNs and Transformer-based designs. CNNs， with their local receptive fields， struggle to capture long-range dependencies， while Transformers， despite their global modeling capabilities， are limited by quadratic computational complexity in high-resolution scenarios. Recently， state-space models have gained popularity in the visual domain due to their linear computational complexity. Despite their low FLOPs， current lightweight Mamba-based models exhibit suboptimal throughput. In this work， we propose the MobileMamba framework， which balances efficiency and performance. We design a three-stage network to enhance inference speed significantly. At a fine-grained level， we introduce the Multi-Receptive Field Feature Interaction(MRFFI) module， comprising the Long-Range Wavelet Transform-Enhanced Mamba(WTE-Mamba)， Efficient Multi-Kernel Depthwise Convolution(MK-DeConv)， and Eliminate Redundant Identity components. This module integrates multi-receptive field information and enhances high-frequency detail extraction. Additionally， we employ training and testing strategies to further improve performance and efficiency. MobileMamba achieves up to 83.6% on Top-1， surpassing existing state-of-the-art methods which is maximum x21 faster than LocalVim on GPU. Extensive experiments on high-resolution downstream tasks demonstrate that MobileMamba surpasses current efficient models， achieving an optimal balance between speed and accuracy.',\n",
       "  'date': '2024-11-24',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2411.15941',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Mamba: Linear-Time Sequence Modeling with Selective State Spaces',\n",
       "  'creators': 'GuAlbert',\n",
       "  'abstractNote': 'Foundation models， now powering most of the exciting applications in deep learning， are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention， gated convolution and recurrent models， and structured state space models (SSMs) have been developed to address Transformers’ computational inefficiency on long sequences， but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning， and make several improvements. First， simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities， allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token. Second， even though this change prevents the use of efficient convolutions， we design a hardware-aware parallel algorithm in recurrent mode. We integrate these selective SSMs into a simplified end-to-end neural network architecture without attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5× higher throughput than Transformers) and linear scaling in sequence length， and its performance improves on real data up to million-length sequences. As a general sequence model backbone， Mamba achieves state-of-the-art performance across several modalities such as language， audio， and genomics. On language modeling， our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size， both in pretraining and downstream evaluation.',\n",
       "  'date': '2024-05-31',\n",
       "  'language': 'en',\n",
       "  'url': 'http://arxiv.org/abs/2312.00752',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Machine Learning']},\n",
       " {'title': 'Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model',\n",
       "  'creators': 'ZhuLianghui',\n",
       "  'abstractNote': 'Recently the state space models (SSMs) with efficient hardware-aware designs， i.e.， the Mamba deep learning model， have shown great potential for long sequence modeling. Meanwhile building efficient and generic vision backbones purely upon SSMs is an appealing direction. However， representing visual data is challenging for SSMs due to the position-sensitivity of visual data and the requirement of global context for visual understanding. In this paper， we show that the reliance on self-attention for visual representation learning is not necessary and propose a new generic vision backbone with bidirectional Mamba blocks (Vim)， which marks the image sequences with position embeddings and compresses the visual representation with bidirectional state space models. On ImageNet classification， COCO object detection， and ADE20k semantic segmentation tasks， Vim achieves higher performance compared to well-established vision transformers like DeiT， while also demonstrating significantly improved computation & memory efficiency. For example， Vim is 2.8$\\\\times$ faster than DeiT and saves 86.8% GPU memory when performing batch inference to extract features on images with a resolution of 1248$\\\\times$1248. The results demonstrate that Vim is capable of overcoming the computation & memory constraints on performing Transformer-style understanding for high-resolution images and it has great potential to be the next-generation backbone for vision foundation models. Code is available at https://github.com/hustvl/Vim.',\n",
       "  'date': '2024-11-14',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2401.09417',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Machine Learning']},\n",
       " {'title': 'Mamba YOLO: A Simple Baseline for Object Detection with State Space Model',\n",
       "  'creators': 'WangZeyu',\n",
       "  'abstractNote': \"Driven by the rapid development of deep learning technology， the YOLO series has set a new benchmark for real-time object detectors. Additionally， transformer-based structures have emerged as the most powerful solution in the field， greatly extending the model's receptive field and achieving significant performance improvements. However， this improvement comes at a cost as the quadratic complexity of the self-attentive mechanism increases the computational burden of the model. To address this problem， we introduce a simple yet effective baseline approach called Mamba YOLO. Our contributions are as follows: 1) We propose that the ODMamba backbone introduce a \\\\textbf{S}tate \\\\textbf{S}pace \\\\textbf{M}odel (\\\\textbf{SSM}) with linear complexity to address the quadratic complexity of self-attention. Unlike the other Transformer-base and SSM-base method， ODMamba is simple to train without pretraining. 2) For real-time requirement， we designed the macro structure of ODMamba， determined the optimal stage ratio and scaling size. 3) We design the RG Block that employs a multi-branch structure to model the channel dimensions， which addresses the possible limitations of SSM in sequence modeling， such as insufficient receptive fields and weak image localization. This design captures localized image dependencies more accurately and significantly. Extensive experiments on the publicly available COCO benchmark dataset show that Mamba YOLO achieves state-of-the-art performance compared to previous methods. Specifically， a tiny version of Mamba YOLO achieves a \\\\textbf{7.5}\\\\% improvement in mAP on a single 4090 GPU with an inference time of \\\\textbf{1.5} ms. The pytorch code is available at: \\\\url{https://github.com/HZAI-ZJNU/Mamba-YOLO}\",\n",
       "  'date': '2024-12-14',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2406.05835',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Mamba-UNet: UNet-Like Pure Visual Mamba for Medical Image Segmentation',\n",
       "  'creators': 'WangZiyang',\n",
       "  'abstractNote': \"In recent advancements in medical image analysis， Convolutional Neural Networks (CNN) and Vision Transformers (ViT) have set significant benchmarks. While the former excels in capturing local features through its convolution operations， the latter achieves remarkable global context understanding by leveraging self-attention mechanisms. However， both architectures exhibit limitations in efficiently modeling long-range dependencies within medical images， which is a critical aspect for precise segmentation. Inspired by the Mamba architecture， known for its proficiency in handling long sequences and global contextual information with enhanced computational efficiency as a State Space Model (SSM)， we propose Mamba-UNet， a novel architecture that synergizes the U-Net in medical image segmentation with Mamba's capability. Mamba-UNet adopts a pure Visual Mamba (VMamba)-based encoder-decoder structure， infused with skip connections to preserve spatial information across different scales of the network. This design facilitates a comprehensive feature learning process， capturing intricate details and broader semantic contexts within medical images. We introduce a novel integration mechanism within the VMamba blocks to ensure seamless connectivity and information flow between the encoder and decoder paths， enhancing the segmentation performance. We conducted experiments on publicly available ACDC MRI Cardiac segmentation dataset， and Synapse CT Abdomen segmentation dataset. The results show that Mamba-UNet outperforms several types of UNet in medical image segmentation under the same hyper-parameter setting. The source code and baseline implementations are available.\",\n",
       "  'date': '2024-03-30',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2402.05079',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Electrical Engineering and Systems Science - Image and Video Processing']},\n",
       " {'title': 'Pinwheel-shaped Convolution and Scale-based Dynamic Loss for Infrared Small Target Detection',\n",
       "  'creators': 'YangJiangnan',\n",
       "  'abstractNote': \"These recent years have witnessed that convolutional neural network (CNN)-based methods for detecting infrared small targets have achieved outstanding performance. However， these methods typically employ standard convolutions， neglecting to consider the spatial characteristics of the pixel distribution of infrared small targets. Therefore， we propose a novel pinwheel-shaped convolution (PConv) as a replacement for standard convolutions in the lower layers of the backbone network. PConv better aligns with the pixel Gaussian spatial distribution of dim small targets， enhances feature extraction， significantly increases the receptive field， and introduces only a minimal increase in parameters. Additionally， while recent loss functions combine scale and location losses， they do not adequately account for the varying sensitivity of these losses across different target scales， limiting detection performance on dim-small targets. To overcome this， we propose a scale-based dynamic (SD) Loss that dynamically adjusts the influence of scale and location losses based on target size， improving the network's ability to detect targets of varying scales. We construct a new benchmark， SIRST-UAVB， which is the largest and most challenging dataset to date for real-shot single-frame infrared small target detection. Lastly， by integrating PConv and SD Loss into the latest small target detection algorithms， we achieved significant performance improvements on IRSTD-1K and our SIRST-UAVB dataset， validating the effectiveness and generalizability of our approach. Code -- https://github.com/JN-Yang/PConv-SDloss-Data\",\n",
       "  'date': '2024-12-22',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2412.16986',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'DuAT: Dual-Aggregation Transformer Network for Medical Image Segmentation',\n",
       "  'creators': 'TangFeilong',\n",
       "  'abstractNote': 'Transformer-based models have been widely demonstrated to be successful in computer vision tasks by modelling long-range dependencies and capturing global representations. However， they are often dominated by features of large patterns leading to the loss of local details (e.g.， boundaries and small objects)， which are critical in medical image segmentation. To alleviate this problem， we propose a Dual-Aggregation Transformer Network called DuAT， which is characterized by two innovative designs， namely， the Global-to-Local Spatial Aggregation (GLSA) and Selective Boundary Aggregation (SBA) modules. The GLSA has the ability to aggregate and represent both global and local spatial features， which are beneficial for locating large and small objects， respectively. The SBA module is used to aggregate the boundary characteristic from low-level features and semantic information from high-level features for better preserving boundary details and locating the re-calibration objects. Extensive experiments in six benchmark datasets demonstrate that our proposed model outperforms state-of-the-art methods in the segmentation of skin lesion images， and polyps in colonoscopy images. In addition， our approach is more robust than existing methods in various challenging situations such as small object segmentation and ambiguous object boundaries.',\n",
       "  'date': '2022-12-21',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2212.11677',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Mamba YOLO: A Simple Baseline for Object Detection with State Space Model',\n",
       "  'creators': 'WangZeyu',\n",
       "  'abstractNote': \"Driven by the rapid development of deep learning technology， the YOLO series has set a new benchmark for real-time object detectors. Additionally， transformer-based structures have emerged as the most powerful solution in the field， greatly extending the model's receptive field and achieving significant performance improvements. However， this improvement comes at a cost as the quadratic complexity of the self-attentive mechanism increases the computational burden of the model. To address this problem， we introduce a simple yet effective baseline approach called Mamba YOLO. Our contributions are as follows: 1) We propose that the ODMamba backbone introduce a \\\\textbf{S}tate \\\\textbf{S}pace \\\\textbf{M}odel (\\\\textbf{SSM}) with linear complexity to address the quadratic complexity of self-attention. Unlike the other Transformer-base and SSM-base method， ODMamba is simple to train without pretraining. 2) For real-time requirement， we designed the macro structure of ODMamba， determined the optimal stage ratio and scaling size. 3) We design the RG Block that employs a multi-branch structure to model the channel dimensions， which addresses the possible limitations of SSM in sequence modeling， such as insufficient receptive fields and weak image localization. This design captures localized image dependencies more accurately and significantly. Extensive experiments on the publicly available COCO benchmark dataset show that Mamba YOLO achieves state-of-the-art performance compared to previous methods. Specifically， a tiny version of Mamba YOLO achieves a \\\\textbf{7.5}\\\\% improvement in mAP on a single 4090 GPU with an inference time of \\\\textbf{1.5} ms. The pytorch code is available at: \\\\url{https://github.com/HZAI-ZJNU/Mamba-YOLO}\",\n",
       "  'date': '2024-12-14',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2406.05835',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection',\n",
       "  'creators': 'ZhangHao',\n",
       "  'abstractNote': 'We present DINO (\\\\textbf{D}ETR with \\\\textbf{I}mproved de\\\\textbf{N}oising anch\\\\textbf{O}r boxes)， a state-of-the-art end-to-end object detector. % in this paper. DINO improves over previous DETR-like models in performance and efficiency by using a contrastive way for denoising training， a mixed query selection method for anchor initialization， and a look forward twice scheme for box prediction. DINO achieves $49.4$AP in $12$ epochs and $51.3$AP in $24$ epochs on COCO with a ResNet-50 backbone and multi-scale features， yielding a significant improvement of $\\\\textbf{+6.0}$\\\\textbf{AP} and $\\\\textbf{+2.7}$\\\\textbf{AP}， respectively， compared to DN-DETR， the previous best DETR-like model. DINO scales well in both model size and data size. Without bells and whistles， after pre-training on the Objects365 dataset with a SwinL backbone， DINO obtains the best results on both COCO \\\\texttt{val2017} ($\\\\textbf{63.2}$\\\\textbf{AP}) and \\\\texttt{test-dev} (\\\\textbf{$\\\\textbf{63.3}$AP}). Compared to other models on the leaderboard， DINO significantly reduces its model size and pre-training data size while achieving better results. Our code will be available at \\\\url{https://github.com/IDEACVR/DINO}.',\n",
       "  'date': '2022-07-11',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2203.03605',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'VMamba: Visual State Space Model',\n",
       "  'creators': 'LiuYue',\n",
       "  'abstractNote': \"Designing computationally efficient network architectures remains an ongoing necessity in computer vision. In this paper， we adapt Mamba， a state-space language model， into VMamba， a vision backbone with linear time complexity. At the core of VMamba is a stack of Visual State-Space (VSS) blocks with the 2D Selective Scan (SS2D) module. By traversing along four scanning routes， SS2D bridges the gap between the ordered nature of 1D selective scan and the non-sequential structure of 2D vision data， which facilitates the collection of contextual information from various sources and perspectives. Based on the VSS blocks， we develop a family of VMamba architectures and accelerate them through a succession of architectural and implementation enhancements. Extensive experiments demonstrate VMamba's promising performance across diverse visual perception tasks， highlighting its superior input scaling efficiency compared to existing benchmark models. Source code is available at https://github.com/MzeroMiko/VMamba.\",\n",
       "  'date': '2024-12-29',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2401.10166',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'BAF-Detector: An Efficient CNN-Based Detector for Photovoltaic Cell Defect Detection',\n",
       "  'creators': 'SuBinyi',\n",
       "  'abstractNote': 'The multi-scale defect detection for photovoltaic (PV) cell electroluminescence (EL) images is a challenging task， due to the feature vanishing as network deepens. To address this problem， an attention-based top-down and bottom-up architecture is developed to accomplish multi-scale feature fusion. This architecture， called Bidirectional Attention Feature Pyramid Network (BAFPN)， can make all layers of the pyramid share similar semantic features. In BAFPN， cosine similarity is employed to measure the importance of each pixel in the fused features. Furthermore， a novel object detector is proposed， called BAF-Detector， which embeds BAFPN into Region Proposal Network (RPN) in Faster RCNN+FPN. BAFPN improves the robustness of the network to scales， thus the proposed detector achieves a good performance in multi-scale defects detection task. Finally， the experimental results on a large-scale EL dataset including 3629 images， 2129 of which are defective， show that the proposed method achieves 98.70% (F-measure)， 88.07% (mAP)， and 73.29% (IoU) in terms of multi-scale defects classification and detection results in raw PV cell EL images.',\n",
       "  'date': '2021-03-28',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2012.10631',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Strip R-CNN: Large Strip Convolution for Remote Sensing Object Detection',\n",
       "  'creators': 'YuanXinbin',\n",
       "  'abstractNote': 'While witnessed with rapid development， remote sensing object detection remains challenging for detecting high aspect ratio objects. This paper shows that large strip convolutions are good feature representation learners for remote sensing object detection and can detect objects of various aspect ratios well. Based on large strip convolutions， we build a new network architecture called Strip R-CNN， which is simple， efficient， and powerful. Unlike recent remote sensing object detectors that leverage large-kernel convolutions with square shapes， our Strip R-CNN takes advantage of sequential orthogonal large strip convolutions to capture spatial information. In addition， we enhance the localization capability of remote-sensing object detectors by decoupling the detection heads and equipping the localization head with strip convolutions to better localize the target objects. Extensive experiments on several benchmarks， e.g.， DOTA， FAIR1M， HRSC2016， and DIOR， show that our Strip R-CNN can largely improve previous works. Notably， our 30M model achieves 82.75% mAP on DOTA-v1.0， setting a new state-of-the-art record.Code is available at https://github.com/YXB-NKU/Strip-R-CNN.',\n",
       "  'date': '2025-01-10',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2501.03775',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Deformable DETR: Deformable Transformers for End-to-End Object Detection',\n",
       "  'creators': 'ZhuXizhou',\n",
       "  'abstractNote': 'DETR has been recently proposed to eliminate the need for many hand-designed components in object detection while demonstrating good performance. However， it suffers from slow convergence and limited feature spatial resolution， due to the limitation of Transformer attention modules in processing image feature maps. To mitigate these issues， we proposed Deformable DETR， whose attention modules only attend to a small set of key sampling points around a reference. Deformable DETR can achieve better performance than DETR (especially on small objects) with 10 times less training epochs. Extensive experiments on the COCO benchmark demonstrate the effectiveness of our approach. Code is released at https://github.com/fundamentalvision/Deformable-DETR.',\n",
       "  'date': '2021-03-18',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2010.04159',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'YOLOv12: Attention-Centric Real-Time Object Detectors',\n",
       "  'creators': 'TianYunjie',\n",
       "  'abstractNote': 'Enhancing the network architecture of the YOLO framework has been crucial for a long time， but has focused on CNN-based improvements despite the proven superiority of attention mechanisms in modeling capabilities. This is because attention-based models cannot match the speed of CNN-based models. This paper proposes an attention-centric YOLO framework， namely YOLOv12， that matches the speed of previous CNN-based ones while harnessing the performance benefits of attention mechanisms. YOLOv12 surpasses all popular real-time object detectors in accuracy with competitive speed. For example， YOLOv12-N achieves 40.6% mAP with an inference latency of 1.64 ms on a T4 GPU， outperforming advanced YOLOv10-N / YOLOv11-N by 2.1%/1.2% mAP with a comparable speed. This advantage extends to other model scales. YOLOv12 also surpasses end-to-end real-time detectors that improve DETR， such as RT-DETR / RT-DETRv2: YOLOv12-S beats RT-DETR-R18 / RT-DETRv2-R18 while running 42% faster， using only 36% of the computation and 45% of the parameters. More comparisons are shown in Figure 1.',\n",
       "  'date': '2025-02-18',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2502.12524',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'End-to-End Object Detection with Transformers',\n",
       "  'creators': 'CarionNicolas',\n",
       "  'abstractNote': 'We present a new method that views object detection as a direct set prediction problem. Our approach streamlines the detection pipeline， effectively removing the need for many hand-designed components like a non-maximum suppression procedure or anchor generation that explicitly encode our prior knowledge about the task. The main ingredients of the new framework， called DEtection TRansformer or DETR， are a set-based global loss that forces unique predictions via bipartite matching， and a transformer encoder-decoder architecture. Given a fixed small set of learned object queries， DETR reasons about the relations of the objects and the global image context to directly output the final set of predictions in parallel. The new model is conceptually simple and does not require a specialized library， unlike many other modern detectors. DETR demonstrates accuracy and run-time performance on par with the well-established and highly-optimized Faster RCNN baseline on the challenging COCO object detection dataset. Moreover， DETR can be easily generalized to produce panoptic segmentation in a unified manner. We show that it significantly outperforms competitive baselines. Training code and pretrained models are available at https://github.com/facebookresearch/detr.',\n",
       "  'date': '2020-05-28',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2005.12872',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'BiFormer: Vision Transformer with Bi-Level Routing Attention',\n",
       "  'creators': 'ZhuLei',\n",
       "  'abstractNote': 'As the core building block of vision transformers， attention is a powerful tool to capture long-range dependency. However， such power comes at a cost: it incurs a huge computation burden and heavy memory footprint as pairwise token interaction across all spatial locations is computed. A series of works attempt to alleviate this problem by introducing handcrafted and content-agnostic sparsity into attention， such as restricting the attention operation to be inside local windows， axial stripes， or dilated windows. In contrast to these approaches， we propose a novel dynamic sparse attention via bi-level routing to enable a more flexible allocation of computations with content awareness. Specifically， for a query， irrelevant key-value pairs are first filtered out at a coarse region level， and then fine-grained token-to-token attention is applied in the union of remaining candidate regions (\\\\ie， routed regions). We provide a simple yet effective implementation of the proposed bi-level routing attention， which utilizes the sparsity to save both computation and memory while involving only GPU-friendly dense matrix multiplications. Built with the proposed bi-level routing attention， a new general vision transformer， named BiFormer， is then presented. As BiFormer attends to a small subset of relevant tokens in a \\\\textbf{query adaptive} manner without distraction from other irrelevant ones， it enjoys both good performance and high computational efficiency， especially in dense prediction tasks. Empirical results across several computer vision tasks such as image classification， object detection， and semantic segmentation verify the effectiveness of our design. Code is available at \\\\url{https://github.com/rayleizhu/BiFormer}.',\n",
       "  'date': '2023-03-15',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2303.08810',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Masked Autoencoders Are Scalable Vision Learners',\n",
       "  'creators': 'HeKaiming',\n",
       "  'abstractNote': 'This paper shows that masked autoencoders (MAE) are scalable self-supervised learners for computer vision. Our MAE approach is simple: we mask random patches of the input image and reconstruct the missing pixels. It is based on two core designs. First， we develop an asymmetric encoder-decoder architecture， with an encoder that operates only on the visible subset of patches (without mask tokens)， along with a lightweight decoder that reconstructs the original image from the latent representation and mask tokens. Second， we find that masking a high proportion of the input image， e.g.， 75%， yields a nontrivial and meaningful self-supervisory task. Coupling these two designs enables us to train large models efficiently and effectively: we accelerate training (by 3x or more) and improve accuracy. Our scalable approach allows for learning high-capacity models that generalize well: e.g.， a vanilla ViT-Huge model achieves the best accuracy (87.8%) among methods that use only ImageNet-1K data. Transfer performance in downstream tasks outperforms supervised pre-training and shows promising scaling behavior.',\n",
       "  'date': '2021-12-19',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2111.06377',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Rich feature hierarchies for accurate object detection and semantic segmentation',\n",
       "  'creators': 'GirshickRoss',\n",
       "  'abstractNote': 'Object detection performance， as measured on the canonical PASCAL VOC dataset， has plateaued in the last few years. The best-performing methods are complex ensemble systems that typically combine multiple low-level image features with high-level context. In this paper， we propose a simple and scalable detection algorithm that improves mean average precision (mAP) by more than 30% relative to the previous best result on VOC 2012---achieving a mAP of 53.3%. Our approach combines two key insights: (1) one can apply high-capacity convolutional neural networks (CNNs) to bottom-up region proposals in order to localize and segment objects and (2) when labeled training data is scarce， supervised pre-training for an auxiliary task， followed by domain-specific fine-tuning， yields a significant performance boost. Since we combine region proposals with CNNs， we call our method R-CNN: Regions with CNN features. We also compare R-CNN to OverFeat， a recently proposed sliding-window detector based on a similar CNN architecture. We find that R-CNN outperforms OverFeat by a large margin on the 200-class ILSVRC2013 detection dataset. Source code for the complete system is available at http://www.cs.berkeley.edu/~rbg/rcnn.',\n",
       "  'date': '2014-10-22',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/1311.2524',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Fast R-CNN',\n",
       "  'creators': 'GirshickRoss',\n",
       "  'abstractNote': 'This paper proposes a Fast Region-based Convolutional Network method (Fast R-CNN) for object detection. Fast R-CNN builds on previous work to efficiently classify object proposals using deep convolutional networks. Compared to previous work， Fast R-CNN employs several innovations to improve training and testing speed while also increasing detection accuracy. Fast R-CNN trains the very deep VGG16 network 9x faster than R-CNN， is 213x faster at test-time， and achieves a higher mAP on PASCAL VOC 2012. Compared to SPPnet， Fast R-CNN trains VGG16 3x faster， tests 10x faster， and is more accurate. Fast R-CNN is implemented in Python and C++ (using Caffe) and is available under the open-source MIT License at https://github.com/rbgirshick/fast-rcnn.',\n",
       "  'date': '2015-09-27',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/1504.08083',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'Attention Is All You Need',\n",
       "  'creators': 'VaswaniAshish',\n",
       "  'abstractNote': 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture， the Transformer， based solely on attention mechanisms， dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task， improving over the existing best results， including ensembles， by over 2 BLEU. On the WMT 2014 English-to-French translation task， our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs， a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.',\n",
       "  'date': '2023-08-02',\n",
       "  'language': 'en',\n",
       "  'url': 'http://arxiv.org/abs/1706.03762',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computation and Language',\n",
       "   'Computer Science - Machine Learning']},\n",
       " {'title': 'Mask R-CNN',\n",
       "  'creators': 'HeKaiming',\n",
       "  'abstractNote': 'We present a conceptually simple， flexible， and general framework for object instance segmentation. Our approach efficiently detects objects in an image while simultaneously generating a high-quality segmentation mask for each instance. The method， called Mask R-CNN， extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. Mask R-CNN is simple to train and adds only a small overhead to Faster R-CNN， running at 5 fps. Moreover， Mask R-CNN is easy to generalize to other tasks， e.g.， allowing us to estimate human poses in the same framework. We show top results in all three tracks of the COCO suite of challenges， including instance segmentation， bounding-box object detection， and person keypoint detection. Without bells and whistles， Mask R-CNN outperforms all existing， single-model entries on every task， including the COCO 2016 challenge winners. We hope our simple and effective approach will serve as a solid baseline and help ease future research in instance-level recognition. Code has been made available at: https://github.com/facebookresearch/Detectron',\n",
       "  'date': '2018-01-24',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/1703.06870',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale',\n",
       "  'creators': 'DosovitskiyAlexey',\n",
       "  'abstractNote': 'While the Transformer architecture has become the de-facto standard for natural language processing tasks， its applications to computer vision remain limited. In vision， attention is either applied in conjunction with convolutional networks， or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet， CIFAR-100， VTAB， etc.)， Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.',\n",
       "  'date': '2021-06-03',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2010.11929',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition',\n",
       "   'Computer Science - Machine Learning']},\n",
       " {'title': 'DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning',\n",
       "  'creators': 'DeepSeek-AI',\n",
       "  'abstractNote': 'We introduce our first-generation reasoning models， DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero， a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step， demonstrates remarkable reasoning capabilities. Through RL， DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However， it encounters challenges such as poor readability， and language mixing. To address these issues and further enhance reasoning performance， we introduce DeepSeek-R1， which incorporates multi-stage training and cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the research community， we open-source DeepSeek-R1-Zero， DeepSeek-R1， and six dense models (1.5B， 7B， 8B， 14B， 32B， 70B) distilled from DeepSeek-R1 based on Qwen and Llama.',\n",
       "  'date': '2025-01-22',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2501.12948',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computation and Language',\n",
       "   'Computer Science - Machine Learning']},\n",
       " {'title': 'DeepSeek-V3 Technical Report',\n",
       "  'creators': 'DeepSeek-AI',\n",
       "  'abstractNote': 'We present DeepSeek-V3， a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training， DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures， which were thoroughly validated in DeepSeek-V2. Furthermore， DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and high-quality tokens， followed by Supervised Fine-Tuning and Reinforcement Learning stages to fully harness its capabilities. Comprehensive evaluations reveal that DeepSeek-V3 outperforms other open-source models and achieves performance comparable to leading closed-source models. Despite its excellent performance， DeepSeek-V3 requires only 2.788M H800 GPU hours for its full training. In addition， its training process is remarkably stable. Throughout the entire training process， we did not experience any irrecoverable loss spikes or perform any rollbacks. The model checkpoints are available at https://github.com/deepseek-ai/DeepSeek-V3.',\n",
       "  'date': '2024-12-27',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2412.19437',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computation and Language']},\n",
       " {'title': 'Coordinated Lane-Changing Scheduling of Multilane Cav Platoons in Heterogeneous Scenarios',\n",
       "  'creators': 'LiuQingquan',\n",
       "  'abstractNote': 'With the development of sensing， communication and automated driving technology， connected and automated vehicles (CAVs) are becoming promising solutions for future transport requirements. It is widely believed that a vehicle platoon is a good form to organize urban traffic in the CAV era. Due to the multicommodity nature of urban traffic streams， vehicles will continuously leave and join a multilane platoon， which inevitably gives rise to the need of lane changing within a multilane platoon. This paper studies the coordinated lane-changing scheduling problem in a CAV platoon， with the goal of transferring the platoon from an initial state to a target state to minimize a certain cost measurement (e.g.， number of steps)， while heterogeneous scenarios are considered. Two approaches， i.e.， an exact and an approximate approach， are proposed. For the exact approach， we formulate an integer linear programming (ILP) model to identify the global optimal solution. Multiple objective functions are defined to meet the different needs. To relieve the computational issue of the exact approach， we further propose a tree-based heuristic search (THS)， an approximate algorithm framework. THS is able to obtain an acceptable solution with negligible computational effort， and has the potential to handle the scheduling problem with more precise modeling or larger platoons. Numerical experiments are conducted to demonstrate the performance of different algorithms on both smalland large-scale cases (with up to 60 vehicles in a platoon)， and the parameter combinations in the THS are tested for the optimal trade-off between solution quality and computational load. The findings indicate that ILP is practical for small- or medium-scale cases， which can generate multiple optimal solutions for different objectives; THS can solve large-scale cases in milliseconds on an ordinary personal computer， while the acquired solution is verified to be only slightly worse than the global optimum.',\n",
       "  'date': '2022-07-11',\n",
       "  'language': 'en',\n",
       "  'url': 'https://papers.ssrn.com/abstract=4154966',\n",
       "  'libraryCatalog': 'papers.ssrn.com',\n",
       "  'tags': ['connected and automated vehicle',\n",
       "   'heuristic algorithm',\n",
       "   'integer linear programming',\n",
       "   'lane changing',\n",
       "   'multilane platoon']}]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprint_data"
   ]
  },
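  {
   "cell_type": "markdown",
   "id": "8f2c1a7e-5b3d-4c9a-9e61-0d4a2b7c8f15",
   "metadata": {},
   "source": [
    "A note on the data above: the fullwidth-comma substitution applied to `abstractNote` suggests these lists are destined for CSV export. The next cell is a minimal, optional sketch of such an export for `preprint_data`; the field list and the output file name `preprint_data.csv` are assumptions rather than part of the original workflow."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a7b5c3d1e2f4860",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch: write preprint_data out as CSV.\n",
    "# Assumptions: every entry carries exactly the fields shown above,\n",
    "# and 'preprint_data.csv' is a placeholder output path.\n",
    "import csv\n",
    "\n",
    "fieldnames = ['title', 'creators', 'abstractNote', 'date',\n",
    "              'language', 'url', 'libraryCatalog', 'tags']\n",
    "\n",
    "with open('preprint_data.csv', 'w', newline='', encoding='utf-8-sig') as f:\n",
    "    writer = csv.DictWriter(f, fieldnames=fieldnames)\n",
    "    writer.writeheader()\n",
    "    for row in preprint_data:\n",
    "        row = dict(row)  # copy so the in-memory list keeps its list-valued tags\n",
    "        row['tags'] = '; '.join(row['tags'])  # flatten the tag list for CSV\n",
    "        writer.writerow(row)"
   ]
  },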
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d7645ab54db92f5e",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-25T14:21:18.240823Z",
     "start_time": "2025-07-25T14:21:18.222814Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'key': 'N48QCALG', 'version': 4973, 'itemType': 'conferencePaper', 'title': 'A deep learning based approach for detecting panels in photovoltaic plants', 'creators': [{'creatorType': 'author', 'firstName': 'Antonio', 'lastName': 'Greco'}, {'creatorType': 'author', 'firstName': 'Christopher', 'lastName': 'Pironti'}, {'creatorType': 'author', 'firstName': 'Alessia', 'lastName': 'Saggese'}, {'creatorType': 'author', 'firstName': 'Mario', 'lastName': 'Vento'}, {'creatorType': 'author', 'firstName': 'Vincenzo', 'lastName': 'Vigilante'}], 'abstractNote': 'Photovoltaic (PV) panels are a clean and widespread way to produce renewable energy from sunlight; at the same time, such plants require maintenance, since solar panels can be affected by many types of damaging factors and have a limited yet variable lifespan. With the impressive growth of such PV installations, it is in the public eye the need of a cheap and effective way to continuously monitor the state of the plants and a standard technique designed to promptly replace broken modules, in order to prevent drops in the energy production. Since the faults mainly appear as Hot Spots on the surface of the PV panels, aerial thermal imaging can be used to diagnose such problems and also locate them in huge plants. To this aim, dedicated automatic Computer Vision methods are able to automatically find hot spots from thermal images, where they appear as white stains. In these methods a fundamental step is the segmentation of the PV panels, which allows to automatically detect each module.', 'date': '2020-01-07', 'proceedingsTitle': 'Proceedings of the 3rd International Conference on Applications of Intelligent Systems', 'conferenceName': 'APPIS 2020: 3rd International Conference on Applications of Intelligent Systems', 'place': 'Las Palmas de Gran Canaria Spain', 'publisher': 'ACM', 'volume': '', 'pages': '1-7', 'series': '', 'language': 'en', 'DOI': '10.1145/3378184.3378185', 'ISBN': '978-1-4503-7630-3', 'shortTitle': '', 'url': 'https://dl.acm.org/doi/10.1145/3378184.3378185', 'accessDate': '2025-06-24T02:58:35Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-06-24T02:58:35Z', 'dateModified': '2025-06-24T02:58:35Z'}\n",
      "{'key': 'VHECFUDS', 'version': 4905, 'itemType': 'conferencePaper', 'title': 'Real-time model base fault diagnosis of PV panels using statistical signal processing', 'creators': [{'creatorType': 'author', 'firstName': 'M.', 'lastName': 'Davarifar'}, {'creatorType': 'author', 'firstName': 'A.', 'lastName': 'Rabhi'}, {'creatorType': 'author', 'firstName': 'A.', 'lastName': 'El-Hajjaji'}, {'creatorType': 'author', 'firstName': 'M.', 'lastName': 'Dahmane'}], 'abstractNote': 'This paper proposes new method of monitoring and fault detection in photovoltaic systems, based mainly on the analysis of the power losses of the photovoltaic system (PV) by using statistical signal processing. Firstly, real time new universal circuit based model of photovoltaic panels is presented. Then, the development of software fault detection on a real installation is performed under the MATLAB/Simulink environment. With model based fault diagnosis analysis, residual signal from comparing Simulink and real model is generated. To observe clear alarm signal from arbitrary data captured, Wald test technic is applied on residual signal. A model residual based on Sequential Probability Ratio Test (WSPRT) framework for electrical fault diagnosis in PV system is introduced.', 'date': '2013-10', 'proceedingsTitle': '2013 International Conference on Renewable Energy Research and Applications (ICRERA)', 'conferenceName': '2013 International Conference on Renewable Energy Research and Applications (ICRERA)', 'place': '', 'publisher': '', 'volume': '', 'pages': '599-604', 'series': '', 'language': '', 'DOI': '10.1109/ICRERA.2013.6749826', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/6749826', 'accessDate': '2025-06-09T18:15:09Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Fault diagnosis', 'type': 1}, {'tag': 'Integrated circuit modeling', 'type': 1}, {'tag': 'Mathematical model', 'type': 1}, {'tag': 'Monitoring', 'type': 1}, {'tag': 'PV system', 'type': 1}, {'tag': 'Photovoltaic systems', 'type': 1}, {'tag': 'Real-time systems', 'type': 1}, {'tag': 'Renewable energy sources', 'type': 1}, {'tag': 'faults diagnosis', 'type': 1}, {'tag': 'real time modeling', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-06-09T18:15:09Z', 'dateModified': '2025-06-09T18:15:09Z'}\n",
      "{'key': 'ZLE6HLEH', 'version': 4900, 'itemType': 'conferencePaper', 'title': 'Fault Detection and Diagnosis of Photovoltaic Systems through I-V Curve Analysis', 'creators': [{'creatorType': 'author', 'firstName': 'Batoul', 'lastName': 'Zbib'}, {'creatorType': 'author', 'firstName': 'Hiba', 'lastName': 'Al Sheikh'}], 'abstractNote': 'This work presents an algorithm to detect and diagnose faults in PhotoVoltaic (PV) systems based on the I-V curve analysis. Three types of faults are investigated: mismatch and shading faults, connectivity faults and short circuit faults. The PV system is modeled using MATLAB/Simulink to simulate the faulty I-V curve behavior for each fault. During each simulation, the I-V curve is examined and compared with that during normal operation (without faults), in order to identify and characterize the anomalies. The different faulty modes affect the I-V characteristics of the PV string in different ways, leaving distinct signatures during its operation. Four attributes are extracted allowing for classification of faults into five classes. For some classes involving more than one fault, further analysis and comparison is carried out to allow discrimination between them. Results show that the proposed technique has good performance in detecting faults even if they are not severe.', 'date': '2020-06', 'proceedingsTitle': '2020 International Conference on Electrical, Communication, and Computer Engineering (ICECCE)', 'conferenceName': '2020 International Conference on Electrical, Communication, and Computer Engineering (ICECCE)', 'place': '', 'publisher': '', 'volume': '', 'pages': '1-6', 'series': '', 'language': '', 'DOI': '10.1109/ICECCE49384.2020.9179390', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/9179390', 'accessDate': '2025-06-09T18:04:29Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Circuit faults', 'type': 1}, {'tag': 'Fault detection', 'type': 1}, {'tag': 'Fault diagnosis', 'type': 1}, {'tag': 'I-V curve analysis', 'type': 1}, {'tag': 'Integrated circuit modeling', 'type': 1}, {'tag': 'Mathematical model', 'type': 1}, {'tag': 'PV string', 'type': 1}, {'tag': 'Radiation effects', 'type': 1}, {'tag': 'Temperature', 'type': 1}, {'tag': 'fault detection and diagnosis', 'type': 1}, {'tag': 'faults', 'type': 1}, {'tag': 'photovoltaic systems', 'type': 1}, {'tag': 'symptoms', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-06-09T18:04:29Z', 'dateModified': '2025-06-09T18:04:29Z'}\n",
      "{'key': 'BFGNMI9P', 'version': 4732, 'itemType': 'conferencePaper', 'title': 'Lightweight and Efficient Distributed Photovoltaic Panel Defect Detection Model', 'creators': [{'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Gu'}, {'creatorType': 'author', 'firstName': 'Jianqi', 'lastName': 'Li'}], 'abstractNote': 'In the detection of defects in distributed photovoltaic (PV) panel, it is crucial to balance the high precision required for defect detection with the practical challenges of deploying models on low-resource devices. To address this challenge, this paper proposes the YOLOv8-PV model based on YOLOv8. Firstly, a shared-parameter detection head is designed to accelerate training and better learn defect features. A lightweight and high-performance PCONV is incorporated into the detection head to enhance higher throughput and lower memory access. Additionally, a Context Guided block is introduced to reduce computational complexity. Finally, an improved MLCA attention mechanism is added to enhance detection accuracy. Experimental results on a distributed photovoltaic panel dataset demonstrate that the YOLOv8-PV model achieves a reduction in Params, GFLOPS, and model size by 50.0%, 60.4%, and 46.6%, respectively, compared to the baseline model. Additionally, the mAP50 is improved by 1.8%.', 'date': '2024-09', 'proceedingsTitle': '2024 4th International Conference on Computer Science and Blockchain (CCSB)', 'conferenceName': '2024 4th International Conference on Computer Science and Blockchain (CCSB)', 'place': '', 'publisher': '', 'volume': '', 'pages': '31-35', 'series': '', 'language': '', 'DOI': '10.1109/CCSB63463.2024.10735631', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/10735631/', 'accessDate': '2025-04-15T02:45:15Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Computational modeling', 'type': 1}, {'tag': 'Defect detection', 'type': 1}, {'tag': 'Feature extraction', 'type': 1}, {'tag': 'Head', 'type': 1}, {'tag': 'Lightweight', 'type': 1}, {'tag': 'Object detection', 'type': 1}, {'tag': 'Optimization', 'type': 1}, {'tag': 'Performance evaluation', 'type': 1}, {'tag': 'Photovoltaic panel', 'type': 1}, {'tag': 'Photovoltaic systems', 'type': 1}, {'tag': 'Throughput', 'type': 1}, {'tag': 'Training', 'type': 1}, {'tag': 'YOLOv8', 'type': 1}, {'tag': 'defect detection', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-04-15T02:45:15Z', 'dateModified': '2025-04-15T02:45:16Z'}\n",
      "{'key': 'ALCVR4SU', 'version': 5091, 'itemType': 'conferencePaper', 'title': \"Run, Don't Walk: Chasing Higher FLOPS for Faster Neural Networks\", 'creators': [{'creatorType': 'author', 'firstName': 'Jierun', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Shiu-hong', 'lastName': 'Kao'}, {'creatorType': 'author', 'firstName': 'Hao', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Weipeng', 'lastName': 'Zhuo'}, {'creatorType': 'author', 'firstName': 'Song', 'lastName': 'Wen'}, {'creatorType': 'author', 'firstName': 'Chul-Ho', 'lastName': 'Lee'}, {'creatorType': 'author', 'firstName': 'S.-H. Gary', 'lastName': 'Chan'}], 'abstractNote': 'To design fast neural networks, many works have been focusing on reducing the number of floating-point operations (FLOPs). We observe that such reduction in FLOPs, however, does not necessarily lead to a similar level of reduction in latency. This mainly stems from inefficiently low floating-point operations per second (FLOPS). To achieve faster networks, we revisit popular operators and demonstrate that such low FLOPS is mainly due to frequent memory access of the operators, especially the depthwise convolution. We hence propose a novel partial convolution (PConv) that extracts spatial features more efficiently, by cutting down redundant computation and memory access simultaneously. Building upon our PConv, we further propose FasterNet, a new family of neural networks, which attains substantially higher running speed than others on a wide range of devices, without compromising on accuracy for various vision tasks. For example, on ImageNet1k, our tiny FasterNet-T0 is 2.8×, 3.3×, and 2.4× faster than MobileViT-XXS on GPU, CPU, and ARM processors, respectively, while being 2.9% more accurate. Our large FasterNet-L achieves impressive 83.5% top-1 accuracy, on par with the emerging Swin-B, while having 36% higher inference throughput on GPU, as well as saving 37% compute time on CPU. Code is available at https://github. com/JierunChen/FasterNet.', 'date': '6/2023', 'proceedingsTitle': '2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'conferenceName': '2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'place': 'Vancouver, BC, Canada', 'publisher': 'IEEE', 'volume': '', 'pages': '12021-12031', 'series': '', 'language': 'en', 'DOI': '10.1109/CVPR52729.2023.01157', 'ISBN': '979-8-3503-0129-8', 'shortTitle': \"Run, Don't Walk\", 'url': 'https://ieeexplore.ieee.org/document/10203371/', 'accessDate': '2025-04-08T12:19:12Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'https://doi.org/10.15223/policy-029', 'extra': '', 'tags': [], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-04-08T12:19:12Z', 'dateModified': '2025-04-08T12:19:12Z'}\n",
      "{'key': 'Y3Q298W3', 'version': 4493, 'itemType': 'conferencePaper', 'title': 'PV-DETR: A Multimodal Fault Detection Model of PV Arrays based on Parallel Block Attention', 'creators': [{'creatorType': 'author', 'firstName': 'Wanghu', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Yihua', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Long', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Jing', 'lastName': 'Li'}], 'abstractNote': 'Detection of faults in photovoltaic arrays can reduce power generation losses and extend the equipment’s lifespan. Traditional operation and maintenance of photovoltaic power stations primarily rely on electrical characteristics or infrared images. However, data from a single modality are susceptible to environmental interference, affecting detection accuracy. To address these issues, we propose a model called PV-DETR for fault detection in photovoltaic arrays under complex environmental conditions. This model is an extension of RT-DETRv2, which leverages the Transformer architecture for feature extraction and decoding. The model employs a PResNet50 module instead of the original ResNet50, along with haar wavelet downsampling and a parallel block attention mechanism. The PResNet50 module can reduce dimensionality while minimizing information loss. Haar wavelet downsampling retains the original global information and compresses feature maps effectively, and the parallel block attention mechanism significantly enhances the detection of small infrared targets. Experimental results show that the final PV-DETR model achieves an average accuracy of 89% and an average recall of 85% in fault detection using multimodal data, outperforming existing models, including the original RT-DETRv2.', 'date': '2024-12', 'proceedingsTitle': '2024 IEEE International Conference on Big Data (BigData)', 'conferenceName': '2024 IEEE International Conference on Big Data (BigData)', 'place': '', 'publisher': '', 'volume': '', 'pages': '3609-3615', 'series': '', 'language': '', 'DOI': '10.1109/BigData62323.2024.10826130', 'ISBN': '', 'shortTitle': 'PV-DETR', 'url': 'https://ieeexplore.ieee.org/document/10826130/?arnumber=10826130', 'accessDate': '2025-03-09T14:03:01Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2573-2978', 'tags': [{'tag': 'Arrays', 'type': 1}, {'tag': 'Computational modeling', 'type': 1}, {'tag': 'Data models', 'type': 1}, {'tag': 'Fault detection', 'type': 1}, {'tag': 'Image coding', 'type': 1}, {'tag': 'Maintenance', 'type': 1}, {'tag': 'Photovoltaic systems', 'type': 1}, {'tag': 'RT-DETRv2', 'type': 1}, {'tag': 'Real-time systems', 'type': 1}, {'tag': 'Transformers', 'type': 1}, {'tag': 'Wavelet transforms', 'type': 1}, {'tag': 'fault detection', 'type': 1}, {'tag': 'infrared images', 'type': 1}, {'tag': 'multimodal', 'type': 1}, {'tag': 'photovoltaic array', 'type': 1}], 'collections': ['4BGL6XIQ'], 'relations': {}, 'dateAdded': '2025-03-09T14:03:01Z', 'dateModified': '2025-03-09T14:03:01Z'}\n",
      "{'key': 'MEWQIKTH', 'version': 4473, 'itemType': 'conferencePaper', 'title': 'AI-Based PV Panels Inspection using an Advanced YOLO Algorithm', 'creators': [{'creatorType': 'author', 'firstName': 'Agus', 'lastName': 'Haeruman'}], 'abstractNote': 'The rapid growth of solar photovoltaic (PV) systems as green energy sources has gained momentum in recent years. However, the anomalies of PV panel defects can reduce its efficiency and minimize energy harvesting from the plant. The manual inspection of PV panel defects throughout the plant is costly and time-consuming. Thus, implementing more intelligent ways to inspect solar panel defects will provide more benefits than traditional ones. This study presents an implementation of a deep learning model to detect solar panel defects using an advanced object detection algorithm called You Look Only Once, version 7 (YOLOv7). YOLO is a popular algorithm in computer vision for classification and localization. The dataset utilized in this study was sourced from ROBOFLOW, consisting of 1660 infrared images showcasing thermal defects in PV panels. The model was constructed to identify a broader range of images with heterogeneity, leveraging the aforementioned dataset. Following validation, the model demonstrates a mean Average Precision (mAP) of 85.9%. With this accuracy, the model is relevant for real-world applications. This assertion is affirmed by testing the model with additional data from separate video-capturing PV panels. The video was recorded using a drone equipped with a thermal camera.', 'date': '2024-08-15', 'proceedingsTitle': '', 'conferenceName': 'Renewable Energy: Generation and Application', 'place': '', 'publisher': '', 'volume': '', 'pages': '230-237', 'series': '', 'language': 'en', 'DOI': '10.21741/9781644903216-30', 'ISBN': '', 'shortTitle': '', 'url': 'https://www.mrforum.com/product/9781644903216-30', 'accessDate': '2025-03-09T13:34:35Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T13:34:35Z', 'dateModified': '2025-03-09T13:34:35Z'}\n",
      "{'key': 'W986MNFK', 'version': 4450, 'itemType': 'conferencePaper', 'title': 'Towards Efficient Solar Panel Inspection: A YOLO-based Method for Hotspot Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Muhammad Irshat', 'lastName': 'Ameerdin'}, {'creatorType': 'author', 'firstName': 'Muhammad Herman', 'lastName': 'Jamaluddin'}, {'creatorType': 'author', 'firstName': 'Ahmad Zaki', 'lastName': 'Shukor'}, {'creatorType': 'author', 'firstName': 'Luqman', 'lastName': 'Al Hakim Kamaruzaman'}, {'creatorType': 'author', 'firstName': 'Syazwani', 'lastName': 'Mohamad'}], 'abstractNote': \"Solar energy that captured by the photovoltaic (PV) cells has gained recognition as an important factor in the global search for sustainable and clean energy sources in recent years. One of the Sustainable Development Goals (SDG) that solar technology directly supports is Affordable and Clean Energy. It can help increase access to clean energy sources by improving the efficiency and dependability of solar panels through minimizing its defects. However, a variety of defects can shorten the lifespan and effectiveness of PV array, which are crucial components of solar energy systems. The study concentrates on detecting hotspots on solar panels, identifiable through thermal imaging technology. This project aims to develop a deep learning-based approach for defect detection of solar panels. The project unfolds with a primary goal, that is designing the integration of a thermal sensor and deep learning to detect and identify defects in PV panels. It follows with crafting a robust algorithm within the deep learning environment for effective defect detection and identification. Next, the algorithm's performance will be evaluated, emphasizing its reliability and accuracy in enhancing defect detection. The process begins with physically examining a solar panel, followed by using a drone-mounted thermal camera to capture thermal images. After obtaining enough data, the images undergo model generation by labelling and annotation process using Roboflow. The model is then tested and trained for defect detection using YOLOv8. Once the desired accuracy is reached, the dataset is formatted. A user-friendly graphical interface is developed for ease of interaction. Then, the system's performance is evaluated using a confusion matrix to gauge the effectiveness of the defect detection approach. The panel's defect will be confirmed with the manual inspection. 
Based on the early result obtained, the model's confidence level that has been acquired is 76%.\", 'date': '2024-05', 'proceedingsTitle': '2024 IEEE 14th Symposium on Computer Applications & Industrial Electronics (ISCAIE)', 'conferenceName': '2024 IEEE 14th Symposium on Computer Applications & Industrial Electronics (ISCAIE)', 'place': '', 'publisher': '', 'volume': '', 'pages': '367-372', 'series': '', 'language': '', 'DOI': '10.1109/ISCAIE61308.2024.10576312', 'ISBN': '', 'shortTitle': 'Towards Efficient Solar Panel Inspection', 'url': 'https://ieeexplore.ieee.org/abstract/document/10576312', 'accessDate': '2025-03-09T13:23:00Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2836-4317', 'tags': [{'tag': 'Accuracy', 'type': 1}, {'tag': 'Deep Learning', 'type': 1}, {'tag': 'Deep learning', 'type': 1}, {'tag': 'Hotspots', 'type': 1}, {'tag': 'Refining', 'type': 1}, {'tag': 'Roboflow', 'type': 1}, {'tag': 'Solar Panel', 'type': 1}, {'tag': 'Solar energy', 'type': 1}, {'tag': 'System performance', 'type': 1}, {'tag': 'Thermal sensors', 'type': 1}, {'tag': 'Training', 'type': 1}, {'tag': 'YOLO', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T13:23:00Z', 'dateModified': '2025-03-09T13:23:00Z'}\n",
      "{'key': '4KKNSYNT', 'version': 4432, 'itemType': 'conferencePaper', 'title': 'Infrared Thermography Based Hotspot Detection Of Photovoltaic Module using YOLO', 'creators': [{'creatorType': 'author', 'firstName': 'Tahmid', 'lastName': 'Tajwar'}, {'creatorType': 'author', 'firstName': 'Ovib', 'lastName': 'Hassan Mobin'}, {'creatorType': 'author', 'firstName': 'Fariha Reza', 'lastName': 'Khan'}, {'creatorType': 'author', 'firstName': 'Shara Fatema', 'lastName': 'Hossain'}, {'creatorType': 'author', 'firstName': 'Mohaimenul', 'lastName': 'Islam'}, {'creatorType': 'author', 'firstName': 'Md.', 'lastName': 'Mosaddequr Rahman'}], 'abstractNote': 'Regarding clean energy production high curiosity is gained by Solar Photovoltaic (PV) worldwide. Faults in the PV modules cause significant issues for the PV systems. Detecting faults of PV modules could help to take the necessary measures. This study uses Infrared thermography (IRT) to detect the hotspot of PV modules. The objective is to develop a hotspot detection tool using ‘YOLO: You Only Look once.’ The images are converted into a data set for a classifier to detect the hotspot of PV modules. Then the learner is trained and tested with the dataset. After that, the output validates with the IRT images of PV modules. The outcome of this study is to apply a real-time object detection tool identifying the defect of the PV module. The result shows that with a more diversified data set, the confidence of detecting the hotspot increases.', 'date': '2021-05', 'proceedingsTitle': '2021 IEEE 12th Energy Conversion Congress & Exposition - Asia (ECCE-Asia)', 'conferenceName': '2021 IEEE 12th Energy Conversion Congress & Exposition - Asia (ECCE-Asia)', 'place': '', 'publisher': '', 'volume': '', 'pages': '1542-1547', 'series': '', 'language': '', 'DOI': '10.1109/ECCE-Asia49820.2021.9478998', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/9478998/?arnumber=9478998', 'accessDate': '2025-03-09T12:57:24Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2150-6086', 'tags': [{'tag': 'Asia', 'type': 1}, {'tag': 'Condition monitoring', 'type': 1}, {'tag': 'Detectors', 'type': 1}, {'tag': 'Infrared thermography', 'type': 1}, {'tag': 'Object detection', 'type': 1}, {'tag': 'Photovoltaic systems', 'type': 1}, {'tag': 'Production', 'type': 1}, {'tag': 'Tools', 'type': 1}, {'tag': 'YOLO', 'type': 1}, {'tag': 'hotspot', 'type': 1}, {'tag': 'machine learning', 'type': 1}, {'tag': 'photovoltaic', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T12:57:24Z', 'dateModified': '2025-03-09T12:57:24Z'}\n",
      "{'key': 'MIVY4YPK', 'version': 4421, 'itemType': 'conferencePaper', 'title': 'Fault Detection of the Solar Photovoltaic Modules Using YOLO Models', 'creators': [{'creatorType': 'author', 'firstName': 'Parveen', 'lastName': 'Malik'}, {'creatorType': 'author', 'firstName': 'Vatsal', 'lastName': 'Saxena'}, {'creatorType': 'author', 'firstName': 'Shreyansh', 'lastName': 'Raj'}, {'creatorType': 'author', 'firstName': 'Saumy', 'lastName': 'Singh'}, {'creatorType': 'author', 'firstName': 'Sachin', 'lastName': 'Kumar'}, {'creatorType': 'author', 'firstName': 'Ganaraj P.', 'lastName': 'S'}], 'abstractNote': 'The growing adoption of solar panels, driven by climate change concerns, underscores the importance of ensuring the reliability of photovoltaic (PV) modules. However, outdoor PV modules deployment face a range of environmental challenges such as extreme temperatures, chemical exposure, and mechanical stress which can lead to aging, defects, and degradation. This research introduces a novel approach for identifying faults in solar photovoltaic (PV) modules. Leveraging deep learning techniques from the You Only Look Once (YOLO) family, specifically the recent YOLOv8 and YOLOv9 models, this paper aims to enhance the reliability and performance of PV systems by accurately detecting and classifying module defects to a thermal images database containing three photo-voltaic cell defects. By automating the fault detection process through computer vision, this work contributes to the ongoing efforts to optimize solar energy generation and maintenance. Further, YOLOv5, YOLOv6, and YOLOv7 are also trained, validated, and tested. The results showed that the novel technique of the GELAN architecture-based model outperformed all other models trained on the custom dataset of thermal images of solar PV modules, achieving a mean average precision (mAP) of 70.4%.', 'date': '2024-09', 'proceedingsTitle': '2024 IEEE Region 10 Symposium (TENSYMP)', 'conferenceName': '2024 IEEE Region 10 Symposium (TENSYMP)', 'place': '', 'publisher': '', 'volume': '', 'pages': '1-6', 'series': '', 'language': '', 'DOI': '10.1109/TENSYMP61132.2024.10752194', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/10752194', 'accessDate': '2025-03-09T12:52:51Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2642-6102', 'tags': [{'tag': 'Computational modeling', 'type': 1}, {'tag': 'Fault detection', 'type': 1}, {'tag': 'Image databases', 'type': 1}, {'tag': 'Maintenance', 'type': 1}, {'tag': 'Microprocessors', 'type': 1}, {'tag': 'Photo Voltaic', 'type': 1}, {'tag': 'Solar energy', 'type': 1}, {'tag': 'Solar panels', 'type': 1}, {'tag': 'Stress', 'type': 1}, {'tag': 'Temperature distribution', 'type': 1}, {'tag': 'YOLO', 'type': 1}, {'tag': 'generalized efficient layer aggregation network', 'type': 1}, {'tag': 'infrared thermography', 'type': 1}, {'tag': 'mAP', 'type': 1}, {'tag': 'object detection', 'type': 1}, {'tag': 'solar cell', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T12:52:51Z', 'dateModified': '2025-03-09T12:52:51Z'}\n",
      "{'key': 'QDGT5X3R', 'version': 5051, 'itemType': 'conferencePaper', 'title': 'Efficient Multi-Scale Attention Module with Cross-Spatial Learning', 'creators': [{'creatorType': 'author', 'firstName': 'Daliang', 'lastName': 'Ouyang'}, {'creatorType': 'author', 'firstName': 'Su', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Guozhong', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Mingzhu', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Huaiyong', 'lastName': 'Guo'}, {'creatorType': 'author', 'firstName': 'Jian', 'lastName': 'Zhan'}, {'creatorType': 'author', 'firstName': 'Zhijie', 'lastName': 'Huang'}], 'abstractNote': 'Remarkable effectiveness of the channel or spatial attention mechanisms for producing more discernible feature representation are illustrated in various computer vision tasks. However, modeling the cross-channel relationships with channel dimensionality reduction may bring side effect in extracting deep visual representations. In this paper, a novel efficient multi-scale attention (EMA) module is proposed. Focusing on retaining the information on per channel and decreasing the computational overhead, we reshape the partly channels into the batch dimensions and group the channel dimensions into multiple sub-features which make the spatial semantic features well-distributed inside each feature group. Specifically, apart from encoding the global information to re-calibrate the channel-wise weight in each parallel branch, the output features of the two parallel branches are further aggregated by a cross-dimension interaction for capturing pixel-level pairwise relationship. We conduct extensive ablation studies and experiments on image classification and object detection tasks with popular benchmarks (e.g., CIFAR-100, ImageNet-1k, MS COCO and VisDrone2019) for evaluating its performance.', 'date': '2023-6-4', 'proceedingsTitle': 'ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)', 'conferenceName': '', 'place': '', 'publisher': '', 'volume': '', 'pages': '1-5', 'series': '', 'language': '', 'DOI': '10.1109/ICASSP49357.2023.10096516', 'ISBN': '', 'shortTitle': '', 'url': 'http://arxiv.org/abs/2305.13563', 'accessDate': '2025-03-07T02:21:10Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2305.13563 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['7TU9FL49'], 'relations': {}, 'dateAdded': '2025-03-07T02:21:10Z', 'dateModified': '2025-03-07T02:21:10Z'}\n",
      "{'key': 'RCXCFEG8', 'version': 5056, 'itemType': 'conferencePaper', 'title': 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks', 'creators': [{'creatorType': 'author', 'firstName': 'Mingxing', 'lastName': 'Tan'}, {'creatorType': 'author', 'firstName': 'Quoc', 'lastName': 'Le'}], 'abstractNote': 'Convolutional Neural Networks (ConvNets) are commonly developed at a fixed resource budget, and then scaled up for better accuracy if more resources are given. In this paper, we systematically study model scaling and identify that carefully balancing network depth, width, and resolution can lead to better performance. Based on this observation, we propose a new scaling method that uniformly scales all dimensions of depth/width/resolution using a simple yet highly effective compound coefficient. We demonstrate the effectiveness of this method on MobileNets and ResNet. To go even further, we use neural architecture search to design a new baseline network and scale it up to obtain a family of models, called EfficientNets, which achieve much better accuracy and efficiency than previous ConvNets. In particular, our EfficientNet-B7 achieves stateof-the-art 84.4% top-1 / 97.1% top-5 accuracy on ImageNet, while being 8.4x smaller and 6.1x faster on inference than the best existing ConvNet (Huang et al., 2018). Our EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7%), Flower (98.8%), and 3 other transfer learning datasets, with an order of magnitude fewer parameters.', 'date': '2019-05-24', 'proceedingsTitle': 'Proceedings of the 36th International Conference on Machine Learning', 'conferenceName': 'International Conference on Machine Learning', 'place': '', 'publisher': 'PMLR', 'volume': '', 'pages': '6105-6114', 'series': '', 'language': 'en', 'DOI': '', 'ISBN': '', 'shortTitle': 'EfficientNet', 'url': 'https://proceedings.mlr.press/v97/tan19a.html', 'accessDate': '2025-03-06T18:21:11Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'proceedings.mlr.press', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2640-3498', 'tags': [], 'collections': ['56JUBZGW'], 'relations': {}, 'dateAdded': '2025-03-06T18:21:11Z', 'dateModified': '2025-03-06T18:21:11Z'}\n",
      "{'key': 'P64AG7Y3', 'version': 4343, 'itemType': 'conferencePaper', 'title': 'DETRs Beat YOLOs on Real-time Object Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Yian', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Wenyu', 'lastName': 'Lv'}, {'creatorType': 'author', 'firstName': 'Shangliang', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Jinman', 'lastName': 'Wei'}, {'creatorType': 'author', 'firstName': 'Guanzhong', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Qingqing', 'lastName': 'Dang'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Jie', 'lastName': 'Chen'}], 'abstractNote': 'The YOLO series has become the most popular framework for real-time object detection due to its reasonable trade-off between speed and accuracy. However, we observe that the speed and accuracy of YOLOs are negatively affected by the NMS. Recently, end-to-end Transformer-based detectors (DETRs) have provided an alternative to eliminating NMS. Nevertheless, the high computational cost limits their practicality and hinders them from fully exploiting the advantage of excluding NMS. In this paper, we propose the Real-Time DEtection TRansformer (RT-DETR), the first real-time end-to-end object detector to our best knowledge that addresses the above dilemma. We build RT-DETR in two steps, drawing on the advanced DETR: first we focus on maintaining accuracy while improving speed, followed by maintaining speed while improving accuracy. Specifically, we design an efficient hybrid encoder to expeditiously process multi-scale features by decoupling intra-scale interaction and cross-scale fusion to improve speed. Then, we propose the uncertainty-minimal query selection to provide high-quality initial queries to the decoder, thereby improving accuracy. In addition, RT-DETR supports flexible speed tuning by adjusting the number of decoder layers to adapt to various scenarios without retraining. Our RT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 / 74 FPS on T4 GPU, outperforming previously advanced YOLOs in both speed and accuracy. Furthermore, RT-DETR-R50 outperforms DINO-R50 by 2.2% AP in accuracy and about 21 times in FPS. After pre-training with Objects365, RTDETR-R50 / R101 achieves 55.3% / 56.2% AP. The project page: https://zhao-yian.github.io/RTDETR.', 'date': '2024-6-16', 'proceedingsTitle': '2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'conferenceName': '2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'place': 'Seattle, WA, USA', 'publisher': 'IEEE', 'volume': '', 'pages': '16965-16974', 'series': '', 'language': 'en', 'DOI': '10.1109/CVPR52733.2024.01605', 'ISBN': '9798350353006', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/10657220/', 'accessDate': '2025-03-05T05:32:03Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'https://doi.org/10.15223/policy-029', 'extra': '', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2025-03-05T05:32:03Z', 'dateModified': '2025-03-05T05:32:03Z'}\n",
      "{'key': 'XHMCU8ZC', 'version': 5043, 'itemType': 'conferencePaper', 'title': 'DETRs Beat YOLOs on Real-time Object Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Yian', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Wenyu', 'lastName': 'Lv'}, {'creatorType': 'author', 'firstName': 'Shangliang', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Jinman', 'lastName': 'Wei'}, {'creatorType': 'author', 'firstName': 'Guanzhong', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Qingqing', 'lastName': 'Dang'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Jie', 'lastName': 'Chen'}], 'abstractNote': 'The YOLO series has become the most popular framework for real-time object detection due to its reasonable trade-off between speed and accuracy. However, we observe that the speed and accuracy of YOLOs are negatively affected by the NMS. Recently, end-to-end Transformer-based detectors (DETRs) have provided an alternative to eliminating NMS. Nevertheless, the high computational cost limits their practicality and hinders them from fully exploiting the advantage of excluding NMS. In this paper, we propose the Real-Time DEtection TRansformer (RT-DETR), the first real-time end-to-end object detector to our best knowledge that addresses the above dilemma. We build RT-DETR in two steps, drawing on the advanced DETR: first we focus on maintaining accuracy while improving speed, followed by maintaining speed while improving accuracy. Specifically, we design an efficient hybrid encoder to expeditiously process multi-scale features by decoupling intra-scale interaction and cross-scale fusion to improve speed. Then, we propose the uncertainty-minimal query selection to provide high-quality initial queries to the decoder, thereby improving accuracy. In addition, RT-DETR supports flexible speed tuning by adjusting the number of decoder layers to adapt to various scenarios without retraining. Our RT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 / 74 FPS on T4 GPU, outperforming previously advanced YOLOs in both speed and accuracy. Furthermore, RT-DETR-R50 outperforms DINO-R50 by 2.2% AP in accuracy and about 21 times in FPS. After pre-training with Objects365, RTDETR-R50 / R101 achieves 55.3% / 56.2% AP. The project page: https://zhao-yian.github.io/RTDETR.', 'date': '2024-6-16', 'proceedingsTitle': '2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'conferenceName': '2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'place': 'Seattle, WA, USA', 'publisher': 'IEEE', 'volume': '', 'pages': '16965-16974', 'series': '', 'language': 'en', 'DOI': '10.1109/CVPR52733.2024.01605', 'ISBN': '979-8-3503-5300-6', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/10657220/', 'accessDate': '2025-03-05T05:31:58Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'https://doi.org/10.15223/policy-029', 'extra': '', 'tags': [], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-03-05T05:31:58Z', 'dateModified': '2025-03-05T05:31:58Z'}\n",
      "{'key': 'YFEBPNNV', 'version': 4320, 'itemType': 'conferencePaper', 'title': 'Strip Pooling: Rethinking Spatial Pooling for Scene Parsing', 'creators': [{'creatorType': 'author', 'firstName': 'Qibin', 'lastName': 'Hou'}, {'creatorType': 'author', 'firstName': 'Li', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Ming-Ming', 'lastName': 'Cheng'}, {'creatorType': 'author', 'firstName': 'Jiashi', 'lastName': 'Feng'}], 'abstractNote': 'Spatial pooling has been proven highly effective in capturing long-range contextual information for pixel-wise prediction tasks, such as scene parsing. In this paper, beyond conventional spatial pooling that usually has a regular shape of N × N , we rethink the formulation of spatial pooling by introducing a new pooling strategy, called strip pooling, which considers a long but narrow kernel, i.e., 1 × N or N × 1. Based on strip pooling, we further investigate spatial pooling architecture design by 1) introducing a new strip pooling module that enables backbone networks to efﬁciently model long-range dependencies, 2) presenting a novel building block with diverse spatial pooling as a core, and 3) systematically comparing the performance of the proposed strip pooling and conventional spatial pooling techniques. Both novel pooling-based designs are lightweight and can serve as an efﬁcient plugand-play module in existing scene parsing networks. Extensive experiments on popular benchmarks (e.g., ADE20K and Cityscapes) demonstrate that our simple approach establishes new state-of-the-art results. Code is available at https://github.com/Andrew-Qibin/SPNet.', 'date': '6/2020', 'proceedingsTitle': '2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'conferenceName': '2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'place': 'Seattle, WA, USA', 'publisher': 'IEEE', 'volume': '', 'pages': '4002-4011', 'series': '', 'language': 'en', 'DOI': '10.1109/CVPR42600.2020.00406', 'ISBN': '978-1-72817-168-5', 'shortTitle': 'Strip Pooling', 'url': 'https://ieeexplore.ieee.org/document/9157204/', 'accessDate': '2025-03-02T14:42:12Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html', 'extra': '', 'tags': [], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-03-02T14:42:12Z', 'dateModified': '2025-03-02T14:42:12Z'}\n",
      "{'key': 'YWII2CG3', 'version': 5081, 'itemType': 'conferencePaper', 'title': 'Oriented R-CNN for Object Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Xingxing', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Gong', 'lastName': 'Cheng'}, {'creatorType': 'author', 'firstName': 'Jiabao', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Xiwen', 'lastName': 'Yao'}, {'creatorType': 'author', 'firstName': 'Junwei', 'lastName': 'Han'}], 'abstractNote': '', 'date': '2021', 'proceedingsTitle': '', 'conferenceName': 'Proceedings of the IEEE/CVF International Conference on Computer Vision', 'place': '', 'publisher': '', 'volume': '', 'pages': '3520-3529', 'series': '', 'language': 'en', 'DOI': '', 'ISBN': '', 'shortTitle': '', 'url': 'https://openaccess.thecvf.com/content/ICCV2021/html/Xie_Oriented_R-CNN_for_Object_Detection_ICCV_2021_paper.html', 'accessDate': '2025-02-10T15:09:40Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'openaccess.thecvf.com', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['M6XZJHS6'], 'relations': {}, 'dateAdded': '2025-02-10T15:09:40Z', 'dateModified': '2025-02-10T15:09:40Z'}\n",
      "{'key': 'BE7QF59M', 'version': 5065, 'itemType': 'conferencePaper', 'title': 'ImageNet Classification with Deep Convolutional Neural Networks', 'creators': [{'creatorType': 'author', 'firstName': 'Alex', 'lastName': 'Krizhevsky'}, {'creatorType': 'author', 'firstName': 'Ilya', 'lastName': 'Sutskever'}, {'creatorType': 'author', 'firstName': 'Geoffrey E', 'lastName': 'Hinton'}], 'abstractNote': 'We trained a large, deep convolutional neural network to classify the 1.3 million high-resolution images in the LSVRC-2010 ImageNet training set into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 39.7\\\\% and 18.9\\\\% which is considerably better than the previous state-of-the-art results. The neural network, which has 60 million parameters and 500,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and two globally connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of convolutional nets. To reduce overfitting in the globally connected layers we employed a new regularization method that proved to be very effective.', 'date': '2012', 'proceedingsTitle': 'Advances in Neural Information Processing Systems', 'conferenceName': '', 'place': '', 'publisher': 'Curran Associates, Inc.', 'volume': '25', 'pages': '', 'series': '', 'language': '', 'DOI': '', 'ISBN': '', 'shortTitle': '', 'url': 'https://proceedings.neurips.cc/paper/2012/hash/c399862d3b9d6b76c8436e924a68c45b-Abstract.html', 'accessDate': '2025-02-10T01:46:00Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'Neural Information Processing Systems', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['VM4CAJSG'], 'relations': {}, 'dateAdded': '2025-02-10T01:46:00Z', 'dateModified': '2025-02-10T01:46:00Z'}\n",
      "{'key': 'PZ9Y7RSL', 'version': 5093, 'itemType': 'conferencePaper', 'title': 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows', 'creators': [{'creatorType': 'author', 'firstName': 'Ze', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Yutong', 'lastName': 'Lin'}, {'creatorType': 'author', 'firstName': 'Yue', 'lastName': 'Cao'}, {'creatorType': 'author', 'firstName': 'Han', 'lastName': 'Hu'}, {'creatorType': 'author', 'firstName': 'Yixuan', 'lastName': 'Wei'}, {'creatorType': 'author', 'firstName': 'Zheng', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Stephen', 'lastName': 'Lin'}, {'creatorType': 'author', 'firstName': 'Baining', 'lastName': 'Guo'}], 'abstractNote': 'This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains, such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. To address these differences, we propose a hierarchical Transformer whose representation is computed with Shifted windows. The shifted windowing scheme brings greater efﬁciency by limiting self-attention computation to non-overlapping local windows while also allowing for cross-window connection. This hierarchical architecture has the ﬂexibility to model at various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it compatible with a broad range of vision tasks, including image classiﬁcation (87.3 top-1 accuracy on ImageNet-1K) and dense prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO testdev) and semantic segmentation (53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-theart by a large margin of +2.7 box AP and +2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. The hierarchical design and the shifted window approach also prove beneﬁcial for all-MLP architectures. The code and models are publicly available at https://github. com/microsoft/Swin-Transformer.', 'date': '10/2021', 'proceedingsTitle': '2021 IEEE/CVF International Conference on Computer Vision (ICCV)', 'conferenceName': '2021 IEEE/CVF International Conference on Computer Vision (ICCV)', 'place': 'Montreal, QC, Canada', 'publisher': 'IEEE', 'volume': '', 'pages': '9992-10002', 'series': '', 'language': 'en', 'DOI': '10.1109/ICCV48922.2021.00986', 'ISBN': '978-1-6654-2812-5', 'shortTitle': 'Swin Transformer', 'url': 'https://ieeexplore.ieee.org/document/9710580/', 'accessDate': '2025-02-04T07:56:10Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'https://doi.org/10.15223/policy-029', 'extra': '', 'tags': [], 'collections': ['VM4CAJSG'], 'relations': {}, 'dateAdded': '2025-02-04T07:56:10Z', 'dateModified': '2025-02-04T07:56:10Z'}\n",
      "{'key': 'YZF4Q3TW', 'version': 5032, 'itemType': 'conferencePaper', 'title': 'Deep Residual Learning for Image Recognition', 'creators': [{'creatorType': 'author', 'firstName': 'Kaiming', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Xiangyu', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Shaoqing', 'lastName': 'Ren'}, {'creatorType': 'author', 'firstName': 'Jian', 'lastName': 'Sun'}], 'abstractNote': 'Deeper neural networks are more difﬁcult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers—8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classiﬁcation task. We also present analysis on CIFAR-10 with 100 and 1000 layers.', 'date': '6/2016', 'proceedingsTitle': '2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)', 'conferenceName': '2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)', 'place': 'Las Vegas, NV, USA', 'publisher': 'IEEE', 'volume': '', 'pages': '770-778', 'series': '', 'language': 'en', 'DOI': '10.1109/CVPR.2016.90', 'ISBN': '978-1-4673-8851-1', 'shortTitle': '', 'url': 'http://ieeexplore.ieee.org/document/7780459/', 'accessDate': '2025-02-04T07:53:20Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['VM4CAJSG'], 'relations': {}, 'dateAdded': '2025-02-04T07:53:20Z', 'dateModified': '2025-02-04T07:53:20Z'}\n",
      "{'key': '27LXG2PB', 'version': 3992, 'itemType': 'conferencePaper', 'title': 'SCConv: Spatial and Channel Reconstruction Convolution for Feature Redundancy', 'creators': [{'creatorType': 'author', 'firstName': 'Jiafeng', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Ying', 'lastName': 'Wen'}, {'creatorType': 'author', 'firstName': 'Lianghua', 'lastName': 'He'}], 'abstractNote': 'Convolutional Neural Networks (CNNs) have achieved remarkable performance in various computer vision tasks but this comes at the cost of tremendous computational resources, partly due to convolutional layers extracting redundant features. Recent works either compress well-trained large-scale models or explore well-designed lightweight models. In this paper, we make an attempt to exploit spatial and channel redundancy among features for CNN compression and propose an efficient convolution module, called SCConv (Spatial and Channel reconstruction Convolution), to decrease redundant computing and facilitate representative feature learning. The proposed SCConv consists of two units: spatial reconstruction unit (SRU) and channel reconstruction unit (CRU). SRU utilizes a separate-and-reconstruct method to suppress the spatial redundancy while CRU uses a split-transform-andfuse strategy to diminish the channel redundancy. In addition, SCConv is a plug-and-play architectural unit that can be used to replace standard convolution in various convolutional neural networks directly. Experimental results show that SCConv-embedded models are able to achieve better performance by reducing redundant features with significantly lower complexity and computational costs.', 'date': '6/2023', 'proceedingsTitle': '2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'conferenceName': '2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'place': 'Vancouver, BC, Canada', 'publisher': 'IEEE', 'volume': '', 'pages': '6153-6162', 'series': '', 'language': 'en', 'DOI': '10.1109/CVPR52729.2023.00596', 'ISBN': '979-8-3503-0129-8', 'shortTitle': 'SCConv', 'url': 'https://ieeexplore.ieee.org/document/10204928/', 'accessDate': '2024-12-24T13:56:18Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'https://doi.org/10.15223/policy-029', 'extra': '', 'tags': [{'tag': 'important-model'}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2024-12-24T13:56:18Z', 'dateModified': '2025-01-19T08:00:52Z'}\n",
      "{'key': '64D8V8M5', 'version': 3984, 'itemType': 'conferencePaper', 'title': 'Traffic signal coordination for emergency vehicles', 'creators': [{'creatorType': 'author', 'firstName': 'Wenwen', 'lastName': 'Kang'}, {'creatorType': 'author', 'firstName': 'Gang', 'lastName': 'Xiong'}, {'creatorType': 'author', 'firstName': 'Yisheng', 'lastName': 'Lv'}, {'creatorType': 'author', 'firstName': 'Xisong', 'lastName': 'Dong'}, {'creatorType': 'author', 'firstName': 'Fenghua', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Qingjie', 'lastName': 'Kong'}], 'abstractNote': 'Reducing travel time of emergency vehicles (EVs) has a potential in significant savings of life and property. Integrating modern intelligent transportation system (ITS) with EV signal preemption seems to be a solution. But existing EV signal preemption systems often break the current signal coordination and impact a lot on the normal traffic streams. In this paper we propose an emergency vehicle signal coordination (EVSC) approach, which is intended to provide “green wave” for EVs. Traffic simulations are conducted along an emergency corridor with 8 intersections in Qingdao, China. Multiple traffic measurements are compared between simulation outputs with and without EVSC operation. The result indicates that the proposed approach can reduce EV travel time by 26.9% without too much negative impact on the normal traffic streams.', 'date': '2014-10', 'proceedingsTitle': '17th International IEEE Conference on Intelligent Transportation Systems (ITSC)', 'conferenceName': '17th International IEEE Conference on Intelligent Transportation Systems (ITSC)', 'place': '', 'publisher': '', 'volume': '', 'pages': '157-161', 'series': '', 'language': '', 'DOI': '10.1109/ITSC.2014.6957683', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/6957683', 'accessDate': '2024-08-13T10:29:32Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2153-0017', 'tags': [{'tag': 'Cities and towns', 'type': 1}, {'tag': 'Delays', 'type': 1}, {'tag': 'Mathematical model', 'type': 1}, {'tag': 'Roads', 'type': 1}, {'tag': 'Traffic control', 'type': 1}, {'tag': 'Vehicles', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-08-13T10:29:32Z', 'dateModified': '2025-01-19T07:17:32Z'}\n",
      "{'key': 'FAEU3M2G', 'version': 3984, 'itemType': 'conferencePaper', 'title': 'Optimal Motion Control for Connected and Automated Electric Vehicles at Signal-Free Intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Xiao', 'lastName': 'Pan'}, {'creatorType': 'author', 'firstName': 'Boli', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Simos A.', 'lastName': 'Evangelou'}, {'creatorType': 'author', 'firstName': 'Stelios', 'lastName': 'Timotheou'}], 'abstractNote': 'Traffic congestion is one of the major issues for urban traffic networks. The connected and autonomous vehicles (CAV) is an emerging technology that has the potential to address this issue by improving safety, efficiency, and capacity of the transportation system. In this paper, the problem of optimal trajectory planning of battery-electric CAVs in the context of cooperative crossing of an unsignalized intersection is addressed. An optimization-based centralized intersection controller is proposed to find the optimal velocity trajectory of each vehicle so as to minimize electric energy consumption and traffic throughput. Solving the underlying optimization problem for a group of CAVs is not straightforward because of the nonlinear and nonconvex dynamics, especially when the powertrain model is explicitly modelled. In order to ensure a rapid solution search and a unique global optimum, the optimal control problem (OCP) is reformulated via convex modeling techniques. Several simulation case studies show the effectiveness of the proposed approach and the trade-off between energy consumption and traffic throughput is illustrated.', 'date': '2020-12', 'proceedingsTitle': '2020 59th IEEE Conference on Decision and Control (CDC)', 'conferenceName': '2020 59th IEEE Conference on Decision and Control (CDC)', 'place': '', 'publisher': '', 'volume': '', 'pages': '2831-2836', 'series': '', 'language': '', 'DOI': '10.1109/CDC42340.2020.9304392', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/9304392', 'accessDate': '2024-07-05T02:46:56Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2576-2370', 'tags': [{'tag': 'Batteries', 'type': 1}, {'tag': 'Energy consumption', 'type': 1}, {'tag': 'Mechanical power transmission', 'type': 1}, {'tag': 'Safety', 'type': 1}, {'tag': 'Torque', 'type': 1}, {'tag': 'Trajectory', 'type': 1}, {'tag': 'Vehicle dynamics', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T02:46:56Z', 'dateModified': '2025-01-19T07:17:30Z'}\n",
      "{'key': 'VWIHH3M2', 'version': 3981, 'itemType': 'conferencePaper', 'title': 'Latency-Robust Control of High-Speed Signal-Free Intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Yang', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Zev', 'lastName': 'Nicolai-Scanio'}, {'creatorType': 'author', 'firstName': 'Zhong-Ping', 'lastName': 'Jiang'}, {'creatorType': 'author', 'firstName': 'Li', 'lastName': 'Jin'}], 'abstractNote': \"High-speed signal-free intersections are a novel urban traffic operation enabled by connected and autonomous vehicles. However, the impact of communication latency on intersection performance has not been well understood. In this paper, we consider vehicle coordination at signal-free intersections with latency. We focus on two questions: (i) how to ensure latency-resiliency of the coordination algorithm, and (ii) how latency affects the intersection's capacity. We consider a trajectory-based model with bounded speed uncertainties. Latency leads to uncertain state observation. We propose a piecewise-linear control law that ensures safety (avoidance of interference) as long as the initial condition is safe. We also analytically quantify the throughput that the proposed control can attain in the face of latency.\", 'date': '2021-05', 'proceedingsTitle': '2021 American Control Conference (ACC)', 'conferenceName': '2021 American Control Conference (ACC)', 'place': '', 'publisher': '', 'volume': '', 'pages': '2935-2942', 'series': '', 'language': '', 'DOI': '10.23919/ACC50511.2021.9482689', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/9482689', 'accessDate': '2024-07-05T03:30:28Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2378-5861', 'tags': [{'tag': 'Autonomous vehicles', 'type': 1}, {'tag': 'Faces', 'type': 1}, {'tag': 'Interference', 'type': 1}, {'tag': 'Safety', 'type': 1}, {'tag': 'Signal-free intersections', 'type': 1}, {'tag': 'Throughput', 'type': 1}, {'tag': 'Uncertainty', 'type': 1}, {'tag': 'connected and autonomous vehicles', 'type': 1}, {'tag': 'robust control', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T03:30:28Z', 'dateModified': '2025-01-19T07:17:28Z'}\n",
      "{'key': 'V4P85YIY', 'version': 3983, 'itemType': 'conferencePaper', 'title': 'Development of Autonomous Drones for Adaptive Obstacle Avoidance in Real World Environments', 'creators': [{'creatorType': 'author', 'firstName': 'Arne', 'lastName': 'Devos'}, {'creatorType': 'author', 'firstName': 'Emad', 'lastName': 'Ebeid'}, {'creatorType': 'author', 'firstName': 'Poramate', 'lastName': 'Manoonpong'}], 'abstractNote': 'Recently, drones have been involved in several critical tasks such as infrastructure inspection, crisis response, and search and rescue operations. Such drones mostly use sophisticated computer vision techniques to effectively avoid obstacles and, thereby, require high computational power. Therefore, this work tuned and tested a computationally inexpensive algorithm, previously developed by the authors, for adaptive obstacle avoidance control of a drone. The algorithm aims at protecting the drone from entering in complex situations such as deadlocks and corners. The algorithm has been validated through simulation and implemented on a newly developed drone platform for infrastructure inspection. The design of the drone platform and the experimental results are presented in this study.', 'date': '2018-08', 'proceedingsTitle': '2018 21st Euromicro Conference on Digital System Design (DSD)', 'conferenceName': '2018 21st Euromicro Conference on Digital System Design (DSD)', 'place': '', 'publisher': '', 'volume': '', 'pages': '707-710', 'series': '', 'language': '', 'DOI': '10.1109/DSD.2018.00009', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/8491889', 'accessDate': '2024-09-29T06:43:55Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Adaptive obstacle avoidance', 'type': 1}, {'tag': 'Autonomous drone system', 'type': 1}, {'tag': 'Collision avoidance', 'type': 1}, {'tag': 'Drones', 'type': 1}, {'tag': 'Implementation', 'type': 1}, {'tag': 'Laser radar', 'type': 1}, {'tag': 'Navigation', 'type': 1}, {'tag': 'Propellers', 'type': 1}, {'tag': 'Signal processing algorithms', 'type': 1}, {'tag': 'Simulation', 'type': 1}, {'tag': 'System recovery', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-09-29T06:43:55Z', 'dateModified': '2025-01-19T07:17:26Z'}\n",
      "{'key': 'G2TFG8XP', 'version': 3983, 'itemType': 'conferencePaper', 'title': 'Deep Reinforcement Learning for Persistent Cruise Control in UAV-aided Data Collection', 'creators': [{'creatorType': 'author', 'firstName': 'Harrison', 'lastName': 'Kurunathan'}, {'creatorType': 'author', 'firstName': 'Kai', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Wei', 'lastName': 'Ni'}, {'creatorType': 'author', 'firstName': 'Eduardo', 'lastName': 'Tovar'}, {'creatorType': 'author', 'firstName': 'Falko', 'lastName': 'Dressler'}], 'abstractNote': 'Autonomous UAV cruising is gaining attention due to its flexible deployment in remote sensing, surveillance, and reconnaissance. A critical challenge in data collection with the autonomous UAV is the buffer overflows at the ground sensors and packet loss due to lossy airborne channels. Trajectory planning of the UAV is vital to alleviate buffer overflows as well as channel fading. In this work, we propose a Deep Deterministic Policy Gradient based Cruise Control (DDPG-CC) to reduce the overall packet loss through online training of headings and cruise velocity of the UAV, as well as the selection of the ground sensors for data collection. Preliminary performance evaluation demonstrates that DDPG-CC reduces the packet loss rate by under 5% when sufficient training is provided to the UAV.', 'date': '2021-10', 'proceedingsTitle': '2021 IEEE 46th Conference on Local Computer Networks (LCN)', 'conferenceName': '2021 IEEE 46th Conference on Local Computer Networks (LCN)', 'place': '', 'publisher': '', 'volume': '', 'pages': '347-350', 'series': '', 'language': '', 'DOI': '10.1109/LCN52139.2021.9525022', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/9525022', 'accessDate': '2024-09-29T06:43:12Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 0742-1303', 'tags': [{'tag': 'Autonomous UAV', 'type': 1}, {'tag': 'Buffer overflows', 'type': 1}, {'tag': 'Cruise control', 'type': 1}, {'tag': 'Data collection', 'type': 1}, {'tag': 'Deep reinforcement learning', 'type': 1}, {'tag': 'Packet loss', 'type': 1}, {'tag': 'Reinforcement learning', 'type': 1}, {'tag': 'Training', 'type': 1}, {'tag': 'Trajectory planning', 'type': 1}, {'tag': 'UAV-aided WSN', 'type': 1}, {'tag': 'Wireless sensor networks', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-09-29T06:43:12Z', 'dateModified': '2025-01-19T07:17:25Z'}\n"
     ]
    }
   ],
   "source": [
    "# 要保存的数据列表\n",
    "conferencePaper_data = []\n",
    "for item in all_items:\n",
    "    # 只提取会议文章，也就是论文的pdf\n",
    "    if 'itemType' in item['data'] and item['data']['itemType'] == 'conferencePaper':\n",
    "        tags = []\n",
    "        for tag in item['data']['tags']:\n",
    "            tags.append(tag['tag'])\n",
    "        if(item['data']['creators']):\n",
    "            creator = item['data']['creators'][0]['lastName'] + item['data']['creators'][0]['firstName']\n",
    "        else:\n",
    "            creator = 'Unknown Author'\n",
    "        print(item['data'])\n",
    "        conferencePaper_data.append({\n",
    "            'title': item['data']['title'],\n",
    "            'creators': item['data']['creators'][0]['lastName'] + item['data']['creators'][0]['firstName'],\n",
    "            'abstractNote': item['data']['abstractNote'].replace(',', '，'),\n",
    "            'conferenceName': item['data']['conferenceName'],\n",
    "            'date': item['data']['date'],\n",
    "            'language': item['data']['language'],\n",
    "            'url': item['data']['url'],\n",
    "            'libraryCatalog': item['data']['libraryCatalog'],\n",
    "            'tags': tags\n",
    "        })"
   ]
  },
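  {
   "cell_type": "markdown",
   "id": "7c1e5a2b-4f3d-4e8a-9b6c-2f1a8d9e0c3b",
   "metadata": {},
   "source": [
    "The full-width comma substitution in `abstractNote` above suggests these records are destined for a CSV export. The next cell is a minimal sketch of such an export, assuming a hypothetical output file `conference_papers.csv`; note that Python's `csv` module quotes fields containing commas on its own, so the substitution is an extra safeguard rather than a requirement."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d2f6b3c-5a4e-4f9b-8c7d-3e2b9f0a1d4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "# Minimal sketch: write the extracted records to a CSV file.\n",
    "# The output path 'conference_papers.csv' is a hypothetical example.\n",
    "fieldnames = ['title', 'creators', 'abstractNote', 'conferenceName',\n",
    "              'date', 'language', 'url', 'libraryCatalog', 'tags']\n",
    "\n",
    "# utf-8-sig prepends a BOM so the full-width punctuation opens cleanly in Excel\n",
    "with open('conference_papers.csv', 'w', newline='', encoding='utf-8-sig') as f:\n",
    "    writer = csv.DictWriter(f, fieldnames=fieldnames)\n",
    "    writer.writeheader()\n",
    "    for row in conferencePaper_data:\n",
    "        # Flatten the tag list into one semicolon-separated cell\n",
    "        writer.writerow({**row, 'tags': '; '.join(row['tags'])})"
   ]
  },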
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "5845d9851781b8da",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-25T14:21:18.476408Z",
     "start_time": "2025-07-25T14:21:18.453409Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'title': 'A deep learning based approach for detecting panels in photovoltaic plants',\n",
       "  'creators': 'GrecoAntonio',\n",
       "  'abstractNote': 'Photovoltaic (PV) panels are a clean and widespread way to produce renewable energy from sunlight; at the same time， such plants require maintenance， since solar panels can be affected by many types of damaging factors and have a limited yet variable lifespan. With the impressive growth of such PV installations， it is in the public eye the need of a cheap and effective way to continuously monitor the state of the plants and a standard technique designed to promptly replace broken modules， in order to prevent drops in the energy production. Since the faults mainly appear as Hot Spots on the surface of the PV panels， aerial thermal imaging can be used to diagnose such problems and also locate them in huge plants. To this aim， dedicated automatic Computer Vision methods are able to automatically find hot spots from thermal images， where they appear as white stains. In these methods a fundamental step is the segmentation of the PV panels， which allows to automatically detect each module.',\n",
       "  'conferenceName': 'APPIS 2020: 3rd International Conference on Applications of Intelligent Systems',\n",
       "  'date': '2020-01-07',\n",
       "  'language': 'en',\n",
       "  'url': 'https://dl.acm.org/doi/10.1145/3378184.3378185',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'Real-time model base fault diagnosis of PV panels using statistical signal processing',\n",
       "  'creators': 'DavarifarM.',\n",
       "  'abstractNote': 'This paper proposes new method of monitoring and fault detection in photovoltaic systems， based mainly on the analysis of the power losses of the photovoltaic system (PV) by using statistical signal processing. Firstly， real time new universal circuit based model of photovoltaic panels is presented. Then， the development of software fault detection on a real installation is performed under the MATLAB/Simulink environment. With model based fault diagnosis analysis， residual signal from comparing Simulink and real model is generated. To observe clear alarm signal from arbitrary data captured， Wald test technic is applied on residual signal. A model residual based on Sequential Probability Ratio Test (WSPRT) framework for electrical fault diagnosis in PV system is introduced.',\n",
       "  'conferenceName': '2013 International Conference on Renewable Energy Research and Applications (ICRERA)',\n",
       "  'date': '2013-10',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/6749826',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Fault diagnosis',\n",
       "   'Integrated circuit modeling',\n",
       "   'Mathematical model',\n",
       "   'Monitoring',\n",
       "   'PV system',\n",
       "   'Photovoltaic systems',\n",
       "   'Real-time systems',\n",
       "   'Renewable energy sources',\n",
       "   'faults diagnosis',\n",
       "   'real time modeling']},\n",
       " {'title': 'Fault Detection and Diagnosis of Photovoltaic Systems through I-V Curve Analysis',\n",
       "  'creators': 'ZbibBatoul',\n",
       "  'abstractNote': 'This work presents an algorithm to detect and diagnose faults in PhotoVoltaic (PV) systems based on the I-V curve analysis. Three types of faults are investigated: mismatch and shading faults， connectivity faults and short circuit faults. The PV system is modeled using MATLAB/Simulink to simulate the faulty I-V curve behavior for each fault. During each simulation， the I-V curve is examined and compared with that during normal operation (without faults)， in order to identify and characterize the anomalies. The different faulty modes affect the I-V characteristics of the PV string in different ways， leaving distinct signatures during its operation. Four attributes are extracted allowing for classification of faults into five classes. For some classes involving more than one fault， further analysis and comparison is carried out to allow discrimination between them. Results show that the proposed technique has good performance in detecting faults even if they are not severe.',\n",
       "  'conferenceName': '2020 International Conference on Electrical, Communication, and Computer Engineering (ICECCE)',\n",
       "  'date': '2020-06',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/9179390',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Circuit faults',\n",
       "   'Fault detection',\n",
       "   'Fault diagnosis',\n",
       "   'I-V curve analysis',\n",
       "   'Integrated circuit modeling',\n",
       "   'Mathematical model',\n",
       "   'PV string',\n",
       "   'Radiation effects',\n",
       "   'Temperature',\n",
       "   'fault detection and diagnosis',\n",
       "   'faults',\n",
       "   'photovoltaic systems',\n",
       "   'symptoms']},\n",
       " {'title': 'Lightweight and Efficient Distributed Photovoltaic Panel Defect Detection Model',\n",
       "  'creators': 'GuYi',\n",
       "  'abstractNote': 'In the detection of defects in distributed photovoltaic (PV) panel， it is crucial to balance the high precision required for defect detection with the practical challenges of deploying models on low-resource devices. To address this challenge， this paper proposes the YOLOv8-PV model based on YOLOv8. Firstly， a shared-parameter detection head is designed to accelerate training and better learn defect features. A lightweight and high-performance PCONV is incorporated into the detection head to enhance higher throughput and lower memory access. Additionally， a Context Guided block is introduced to reduce computational complexity. Finally， an improved MLCA attention mechanism is added to enhance detection accuracy. Experimental results on a distributed photovoltaic panel dataset demonstrate that the YOLOv8-PV model achieves a reduction in Params， GFLOPS， and model size by 50.0%， 60.4%， and 46.6%， respectively， compared to the baseline model. Additionally， the mAP50 is improved by 1.8%.',\n",
       "  'conferenceName': '2024 4th International Conference on Computer Science and Blockchain (CCSB)',\n",
       "  'date': '2024-09',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/10735631/',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Computational modeling',\n",
       "   'Defect detection',\n",
       "   'Feature extraction',\n",
       "   'Head',\n",
       "   'Lightweight',\n",
       "   'Object detection',\n",
       "   'Optimization',\n",
       "   'Performance evaluation',\n",
       "   'Photovoltaic panel',\n",
       "   'Photovoltaic systems',\n",
       "   'Throughput',\n",
       "   'Training',\n",
       "   'YOLOv8',\n",
       "   'defect detection']},\n",
       " {'title': \"Run, Don't Walk: Chasing Higher FLOPS for Faster Neural Networks\",\n",
       "  'creators': 'ChenJierun',\n",
       "  'abstractNote': 'To design fast neural networks， many works have been focusing on reducing the number of floating-point operations (FLOPs). We observe that such reduction in FLOPs， however， does not necessarily lead to a similar level of reduction in latency. This mainly stems from inefficiently low floating-point operations per second (FLOPS). To achieve faster networks， we revisit popular operators and demonstrate that such low FLOPS is mainly due to frequent memory access of the operators， especially the depthwise convolution. We hence propose a novel partial convolution (PConv) that extracts spatial features more efficiently， by cutting down redundant computation and memory access simultaneously. Building upon our PConv， we further propose FasterNet， a new family of neural networks， which attains substantially higher running speed than others on a wide range of devices， without compromising on accuracy for various vision tasks. For example， on ImageNet1k， our tiny FasterNet-T0 is 2.8×， 3.3×， and 2.4× faster than MobileViT-XXS on GPU， CPU， and ARM processors， respectively， while being 2.9% more accurate. Our large FasterNet-L achieves impressive 83.5% top-1 accuracy， on par with the emerging Swin-B， while having 36% higher inference throughput on GPU， as well as saving 37% compute time on CPU. Code is available at https://github. com/JierunChen/FasterNet.',\n",
       "  'conferenceName': '2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)',\n",
       "  'date': '6/2023',\n",
       "  'language': 'en',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/10203371/',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'PV-DETR: A Multimodal Fault Detection Model of PV Arrays based on Parallel Block Attention',\n",
       "  'creators': 'ChenWanghu',\n",
       "  'abstractNote': 'Detection of faults in photovoltaic arrays can reduce power generation losses and extend the equipment’s lifespan. Traditional operation and maintenance of photovoltaic power stations primarily rely on electrical characteristics or infrared images. However， data from a single modality are susceptible to environmental interference， affecting detection accuracy. To address these issues， we propose a model called PV-DETR for fault detection in photovoltaic arrays under complex environmental conditions. This model is an extension of RT-DETRv2， which leverages the Transformer architecture for feature extraction and decoding. The model employs a PResNet50 module instead of the original ResNet50， along with haar wavelet downsampling and a parallel block attention mechanism. The PResNet50 module can reduce dimensionality while minimizing information loss. Haar wavelet downsampling retains the original global information and compresses feature maps effectively， and the parallel block attention mechanism significantly enhances the detection of small infrared targets. Experimental results show that the final PV-DETR model achieves an average accuracy of 89% and an average recall of 85% in fault detection using multimodal data， outperforming existing models， including the original RT-DETRv2.',\n",
       "  'conferenceName': '2024 IEEE International Conference on Big Data (BigData)',\n",
       "  'date': '2024-12',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/10826130/?arnumber=10826130',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Arrays',\n",
       "   'Computational modeling',\n",
       "   'Data models',\n",
       "   'Fault detection',\n",
       "   'Image coding',\n",
       "   'Maintenance',\n",
       "   'Photovoltaic systems',\n",
       "   'RT-DETRv2',\n",
       "   'Real-time systems',\n",
       "   'Transformers',\n",
       "   'Wavelet transforms',\n",
       "   'fault detection',\n",
       "   'infrared images',\n",
       "   'multimodal',\n",
       "   'photovoltaic array']},\n",
       " {'title': 'AI-Based PV Panels Inspection using an Advanced YOLO Algorithm',\n",
       "  'creators': 'HaerumanAgus',\n",
       "  'abstractNote': 'The rapid growth of solar photovoltaic (PV) systems as green energy sources has gained momentum in recent years. However， the anomalies of PV panel defects can reduce its efficiency and minimize energy harvesting from the plant. The manual inspection of PV panel defects throughout the plant is costly and time-consuming. Thus， implementing more intelligent ways to inspect solar panel defects will provide more benefits than traditional ones. This study presents an implementation of a deep learning model to detect solar panel defects using an advanced object detection algorithm called You Look Only Once， version 7 (YOLOv7). YOLO is a popular algorithm in computer vision for classification and localization. The dataset utilized in this study was sourced from ROBOFLOW， consisting of 1660 infrared images showcasing thermal defects in PV panels. The model was constructed to identify a broader range of images with heterogeneity， leveraging the aforementioned dataset. Following validation， the model demonstrates a mean Average Precision (mAP) of 85.9%. With this accuracy， the model is relevant for real-world applications. This assertion is affirmed by testing the model with additional data from separate video-capturing PV panels. The video was recorded using a drone equipped with a thermal camera.',\n",
       "  'conferenceName': 'Renewable Energy: Generation and Application',\n",
       "  'date': '2024-08-15',\n",
       "  'language': 'en',\n",
       "  'url': 'https://www.mrforum.com/product/9781644903216-30',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'Towards Efficient Solar Panel Inspection: A YOLO-based Method for Hotspot Detection',\n",
       "  'creators': 'AmeerdinMuhammad Irshat',\n",
       "  'abstractNote': \"Solar energy that captured by the photovoltaic (PV) cells has gained recognition as an important factor in the global search for sustainable and clean energy sources in recent years. One of the Sustainable Development Goals (SDG) that solar technology directly supports is Affordable and Clean Energy. It can help increase access to clean energy sources by improving the efficiency and dependability of solar panels through minimizing its defects. However， a variety of defects can shorten the lifespan and effectiveness of PV array， which are crucial components of solar energy systems. The study concentrates on detecting hotspots on solar panels， identifiable through thermal imaging technology. This project aims to develop a deep learning-based approach for defect detection of solar panels. The project unfolds with a primary goal， that is designing the integration of a thermal sensor and deep learning to detect and identify defects in PV panels. It follows with crafting a robust algorithm within the deep learning environment for effective defect detection and identification. Next， the algorithm's performance will be evaluated， emphasizing its reliability and accuracy in enhancing defect detection. The process begins with physically examining a solar panel， followed by using a drone-mounted thermal camera to capture thermal images. After obtaining enough data， the images undergo model generation by labelling and annotation process using Roboflow. The model is then tested and trained for defect detection using YOLOv8. Once the desired accuracy is reached， the dataset is formatted. A user-friendly graphical interface is developed for ease of interaction. Then， the system's performance is evaluated using a confusion matrix to gauge the effectiveness of the defect detection approach. The panel's defect will be confirmed with the manual inspection. Based on the early result obtained， the model's confidence level that has been acquired is 76%.\",\n",
       "  'conferenceName': '2024 IEEE 14th Symposium on Computer Applications & Industrial Electronics (ISCAIE)',\n",
       "  'date': '2024-05',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/10576312',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Accuracy',\n",
       "   'Deep Learning',\n",
       "   'Deep learning',\n",
       "   'Hotspots',\n",
       "   'Refining',\n",
       "   'Roboflow',\n",
       "   'Solar Panel',\n",
       "   'Solar energy',\n",
       "   'System performance',\n",
       "   'Thermal sensors',\n",
       "   'Training',\n",
       "   'YOLO']},\n",
       " {'title': 'Infrared Thermography Based Hotspot Detection Of Photovoltaic Module using YOLO',\n",
       "  'creators': 'TajwarTahmid',\n",
       "  'abstractNote': 'Regarding clean energy production high curiosity is gained by Solar Photovoltaic (PV) worldwide. Faults in the PV modules cause significant issues for the PV systems. Detecting faults of PV modules could help to take the necessary measures. This study uses Infrared thermography (IRT) to detect the hotspot of PV modules. The objective is to develop a hotspot detection tool using ‘YOLO: You Only Look once.’ The images are converted into a data set for a classifier to detect the hotspot of PV modules. Then the learner is trained and tested with the dataset. After that， the output validates with the IRT images of PV modules. The outcome of this study is to apply a real-time object detection tool identifying the defect of the PV module. The result shows that with a more diversified data set， the confidence of detecting the hotspot increases.',\n",
       "  'conferenceName': '2021 IEEE 12th Energy Conversion Congress & Exposition - Asia (ECCE-Asia)',\n",
       "  'date': '2021-05',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/9478998/?arnumber=9478998',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Asia',\n",
       "   'Condition monitoring',\n",
       "   'Detectors',\n",
       "   'Infrared thermography',\n",
       "   'Object detection',\n",
       "   'Photovoltaic systems',\n",
       "   'Production',\n",
       "   'Tools',\n",
       "   'YOLO',\n",
       "   'hotspot',\n",
       "   'machine learning',\n",
       "   'photovoltaic']},\n",
       " {'title': 'Fault Detection of the Solar Photovoltaic Modules Using YOLO Models',\n",
       "  'creators': 'MalikParveen',\n",
       "  'abstractNote': 'The growing adoption of solar panels， driven by climate change concerns， underscores the importance of ensuring the reliability of photovoltaic (PV) modules. However， outdoor PV modules deployment face a range of environmental challenges such as extreme temperatures， chemical exposure， and mechanical stress which can lead to aging， defects， and degradation. This research introduces a novel approach for identifying faults in solar photovoltaic (PV) modules. Leveraging deep learning techniques from the You Only Look Once (YOLO) family， specifically the recent YOLOv8 and YOLOv9 models， this paper aims to enhance the reliability and performance of PV systems by accurately detecting and classifying module defects to a thermal images database containing three photo-voltaic cell defects. By automating the fault detection process through computer vision， this work contributes to the ongoing efforts to optimize solar energy generation and maintenance. Further， YOLOv5， YOLOv6， and YOLOv7 are also trained， validated， and tested. The results showed that the novel technique of the GELAN architecture-based model outperformed all other models trained on the custom dataset of thermal images of solar PV modules， achieving a mean average precision (mAP) of 70.4%.',\n",
       "  'conferenceName': '2024 IEEE Region 10 Symposium (TENSYMP)',\n",
       "  'date': '2024-09',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/10752194',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Computational modeling',\n",
       "   'Fault detection',\n",
       "   'Image databases',\n",
       "   'Maintenance',\n",
       "   'Microprocessors',\n",
       "   'Photo Voltaic',\n",
       "   'Solar energy',\n",
       "   'Solar panels',\n",
       "   'Stress',\n",
       "   'Temperature distribution',\n",
       "   'YOLO',\n",
       "   'generalized efficient layer aggregation network',\n",
       "   'infrared thermography',\n",
       "   'mAP',\n",
       "   'object detection',\n",
       "   'solar cell']},\n",
       " {'title': 'Efficient Multi-Scale Attention Module with Cross-Spatial Learning',\n",
       "  'creators': 'OuyangDaliang',\n",
       "  'abstractNote': 'Remarkable effectiveness of the channel or spatial attention mechanisms for producing more discernible feature representation are illustrated in various computer vision tasks. However， modeling the cross-channel relationships with channel dimensionality reduction may bring side effect in extracting deep visual representations. In this paper， a novel efficient multi-scale attention (EMA) module is proposed. Focusing on retaining the information on per channel and decreasing the computational overhead， we reshape the partly channels into the batch dimensions and group the channel dimensions into multiple sub-features which make the spatial semantic features well-distributed inside each feature group. Specifically， apart from encoding the global information to re-calibrate the channel-wise weight in each parallel branch， the output features of the two parallel branches are further aggregated by a cross-dimension interaction for capturing pixel-level pairwise relationship. We conduct extensive ablation studies and experiments on image classification and object detection tasks with popular benchmarks (e.g.， CIFAR-100， ImageNet-1k， MS COCO and VisDrone2019) for evaluating its performance.',\n",
       "  'conferenceName': '',\n",
       "  'date': '2023-6-4',\n",
       "  'language': '',\n",
       "  'url': 'http://arxiv.org/abs/2305.13563',\n",
       "  'libraryCatalog': 'arXiv.org',\n",
       "  'tags': ['Computer Science - Artificial Intelligence',\n",
       "   'Computer Science - Computer Vision and Pattern Recognition']},\n",
       " {'title': 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks',\n",
       "  'creators': 'TanMingxing',\n",
       "  'abstractNote': 'Convolutional Neural Networks (ConvNets) are commonly developed at a fixed resource budget， and then scaled up for better accuracy if more resources are given. In this paper， we systematically study model scaling and identify that carefully balancing network depth， width， and resolution can lead to better performance. Based on this observation， we propose a new scaling method that uniformly scales all dimensions of depth/width/resolution using a simple yet highly effective compound coefficient. We demonstrate the effectiveness of this method on MobileNets and ResNet. To go even further， we use neural architecture search to design a new baseline network and scale it up to obtain a family of models， called EfficientNets， which achieve much better accuracy and efficiency than previous ConvNets. In particular， our EfficientNet-B7 achieves stateof-the-art 84.4% top-1 / 97.1% top-5 accuracy on ImageNet， while being 8.4x smaller and 6.1x faster on inference than the best existing ConvNet (Huang et al.， 2018). Our EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7%)， Flower (98.8%)， and 3 other transfer learning datasets， with an order of magnitude fewer parameters.',\n",
       "  'conferenceName': 'International Conference on Machine Learning',\n",
       "  'date': '2019-05-24',\n",
       "  'language': 'en',\n",
       "  'url': 'https://proceedings.mlr.press/v97/tan19a.html',\n",
       "  'libraryCatalog': 'proceedings.mlr.press',\n",
       "  'tags': []},\n",
       " {'title': 'DETRs Beat YOLOs on Real-time Object Detection',\n",
       "  'creators': 'ZhaoYian',\n",
       "  'abstractNote': 'The YOLO series has become the most popular framework for real-time object detection due to its reasonable trade-off between speed and accuracy. However， we observe that the speed and accuracy of YOLOs are negatively affected by the NMS. Recently， end-to-end Transformer-based detectors (DETRs) have provided an alternative to eliminating NMS. Nevertheless， the high computational cost limits their practicality and hinders them from fully exploiting the advantage of excluding NMS. In this paper， we propose the Real-Time DEtection TRansformer (RT-DETR)， the first real-time end-to-end object detector to our best knowledge that addresses the above dilemma. We build RT-DETR in two steps， drawing on the advanced DETR: first we focus on maintaining accuracy while improving speed， followed by maintaining speed while improving accuracy. Specifically， we design an efficient hybrid encoder to expeditiously process multi-scale features by decoupling intra-scale interaction and cross-scale fusion to improve speed. Then， we propose the uncertainty-minimal query selection to provide high-quality initial queries to the decoder， thereby improving accuracy. In addition， RT-DETR supports flexible speed tuning by adjusting the number of decoder layers to adapt to various scenarios without retraining. Our RT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 / 74 FPS on T4 GPU， outperforming previously advanced YOLOs in both speed and accuracy. Furthermore， RT-DETR-R50 outperforms DINO-R50 by 2.2% AP in accuracy and about 21 times in FPS. After pre-training with Objects365， RTDETR-R50 / R101 achieves 55.3% / 56.2% AP. The project page: https://zhao-yian.github.io/RTDETR.',\n",
       "  'conferenceName': '2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)',\n",
       "  'date': '2024-6-16',\n",
       "  'language': 'en',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/10657220/',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'DETRs Beat YOLOs on Real-time Object Detection',\n",
       "  'creators': 'ZhaoYian',\n",
       "  'abstractNote': 'The YOLO series has become the most popular framework for real-time object detection due to its reasonable trade-off between speed and accuracy. However， we observe that the speed and accuracy of YOLOs are negatively affected by the NMS. Recently， end-to-end Transformer-based detectors (DETRs) have provided an alternative to eliminating NMS. Nevertheless， the high computational cost limits their practicality and hinders them from fully exploiting the advantage of excluding NMS. In this paper， we propose the Real-Time DEtection TRansformer (RT-DETR)， the first real-time end-to-end object detector to our best knowledge that addresses the above dilemma. We build RT-DETR in two steps， drawing on the advanced DETR: first we focus on maintaining accuracy while improving speed， followed by maintaining speed while improving accuracy. Specifically， we design an efficient hybrid encoder to expeditiously process multi-scale features by decoupling intra-scale interaction and cross-scale fusion to improve speed. Then， we propose the uncertainty-minimal query selection to provide high-quality initial queries to the decoder， thereby improving accuracy. In addition， RT-DETR supports flexible speed tuning by adjusting the number of decoder layers to adapt to various scenarios without retraining. Our RT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 / 74 FPS on T4 GPU， outperforming previously advanced YOLOs in both speed and accuracy. Furthermore， RT-DETR-R50 outperforms DINO-R50 by 2.2% AP in accuracy and about 21 times in FPS. After pre-training with Objects365， RTDETR-R50 / R101 achieves 55.3% / 56.2% AP. The project page: https://zhao-yian.github.io/RTDETR.',\n",
       "  'conferenceName': '2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)',\n",
       "  'date': '2024-6-16',\n",
       "  'language': 'en',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/10657220/',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'Strip Pooling: Rethinking Spatial Pooling for Scene Parsing',\n",
       "  'creators': 'HouQibin',\n",
       "  'abstractNote': 'Spatial pooling has been proven highly effective in capturing long-range contextual information for pixel-wise prediction tasks， such as scene parsing. In this paper， beyond conventional spatial pooling that usually has a regular shape of N × N ， we rethink the formulation of spatial pooling by introducing a new pooling strategy， called strip pooling， which considers a long but narrow kernel， i.e.， 1 × N or N × 1. Based on strip pooling， we further investigate spatial pooling architecture design by 1) introducing a new strip pooling module that enables backbone networks to efﬁciently model long-range dependencies， 2) presenting a novel building block with diverse spatial pooling as a core， and 3) systematically comparing the performance of the proposed strip pooling and conventional spatial pooling techniques. Both novel pooling-based designs are lightweight and can serve as an efﬁcient plugand-play module in existing scene parsing networks. Extensive experiments on popular benchmarks (e.g.， ADE20K and Cityscapes) demonstrate that our simple approach establishes new state-of-the-art results. Code is available at https://github.com/Andrew-Qibin/SPNet.',\n",
       "  'conferenceName': '2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)',\n",
       "  'date': '6/2020',\n",
       "  'language': 'en',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/9157204/',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'Oriented R-CNN for Object Detection',\n",
       "  'creators': 'XieXingxing',\n",
       "  'abstractNote': '',\n",
       "  'conferenceName': 'Proceedings of the IEEE/CVF International Conference on Computer Vision',\n",
       "  'date': '2021',\n",
       "  'language': 'en',\n",
       "  'url': 'https://openaccess.thecvf.com/content/ICCV2021/html/Xie_Oriented_R-CNN_for_Object_Detection_ICCV_2021_paper.html',\n",
       "  'libraryCatalog': 'openaccess.thecvf.com',\n",
       "  'tags': []},\n",
       " {'title': 'ImageNet Classification with Deep Convolutional Neural Networks',\n",
       "  'creators': 'KrizhevskyAlex',\n",
       "  'abstractNote': 'We trained a large， deep convolutional neural network to classify the 1.3 million high-resolution images in the LSVRC-2010 ImageNet training set into the 1000 different classes. On the test data， we achieved top-1 and top-5 error rates of 39.7\\\\% and 18.9\\\\% which is considerably better than the previous state-of-the-art results. The neural network， which has 60 million parameters and 500，000 neurons， consists of five convolutional layers， some of which are followed by max-pooling layers， and two globally connected layers with a final 1000-way softmax. To make training faster， we used non-saturating neurons and a very efficient GPU implementation of convolutional nets. To reduce overfitting in the globally connected layers we employed a new regularization method that proved to be very effective.',\n",
       "  'conferenceName': '',\n",
       "  'date': '2012',\n",
       "  'language': '',\n",
       "  'url': 'https://proceedings.neurips.cc/paper/2012/hash/c399862d3b9d6b76c8436e924a68c45b-Abstract.html',\n",
       "  'libraryCatalog': 'Neural Information Processing Systems',\n",
       "  'tags': []},\n",
       " {'title': 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows',\n",
       "  'creators': 'LiuZe',\n",
       "  'abstractNote': 'This paper presents a new vision Transformer， called Swin Transformer， that capably serves as a general-purpose backbone for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains， such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. To address these differences， we propose a hierarchical Transformer whose representation is computed with Shifted windows. The shifted windowing scheme brings greater efﬁciency by limiting self-attention computation to non-overlapping local windows while also allowing for cross-window connection. This hierarchical architecture has the ﬂexibility to model at various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it compatible with a broad range of vision tasks， including image classiﬁcation (87.3 top-1 accuracy on ImageNet-1K) and dense prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO testdev) and semantic segmentation (53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-theart by a large margin of +2.7 box AP and +2.6 mask AP on COCO， and +3.2 mIoU on ADE20K， demonstrating the potential of Transformer-based models as vision backbones. The hierarchical design and the shifted window approach also prove beneﬁcial for all-MLP architectures. The code and models are publicly available at https://github. com/microsoft/Swin-Transformer.',\n",
       "  'conferenceName': '2021 IEEE/CVF International Conference on Computer Vision (ICCV)',\n",
       "  'date': '10/2021',\n",
       "  'language': 'en',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/9710580/',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'Deep Residual Learning for Image Recognition',\n",
       "  'creators': 'HeKaiming',\n",
       "  'abstractNote': 'Deeper neural networks are more difﬁcult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs， instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize， and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers—8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classiﬁcation task. We also present analysis on CIFAR-10 with 100 and 1000 layers.',\n",
       "  'conferenceName': '2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)',\n",
       "  'date': '6/2016',\n",
       "  'language': 'en',\n",
       "  'url': 'http://ieeexplore.ieee.org/document/7780459/',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': []},\n",
       " {'title': 'SCConv: Spatial and Channel Reconstruction Convolution for Feature Redundancy',\n",
       "  'creators': 'LiJiafeng',\n",
       "  'abstractNote': 'Convolutional Neural Networks (CNNs) have achieved remarkable performance in various computer vision tasks but this comes at the cost of tremendous computational resources， partly due to convolutional layers extracting redundant features. Recent works either compress well-trained large-scale models or explore well-designed lightweight models. In this paper， we make an attempt to exploit spatial and channel redundancy among features for CNN compression and propose an efficient convolution module， called SCConv (Spatial and Channel reconstruction Convolution)， to decrease redundant computing and facilitate representative feature learning. The proposed SCConv consists of two units: spatial reconstruction unit (SRU) and channel reconstruction unit (CRU). SRU utilizes a separate-and-reconstruct method to suppress the spatial redundancy while CRU uses a split-transform-andfuse strategy to diminish the channel redundancy. In addition， SCConv is a plug-and-play architectural unit that can be used to replace standard convolution in various convolutional neural networks directly. Experimental results show that SCConv-embedded models are able to achieve better performance by reducing redundant features with significantly lower complexity and computational costs.',\n",
       "  'conferenceName': '2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)',\n",
       "  'date': '6/2023',\n",
       "  'language': 'en',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/10204928/',\n",
       "  'libraryCatalog': 'DOI.org (Crossref)',\n",
       "  'tags': ['important-model']},\n",
       " {'title': 'Traffic signal coordination for emergency vehicles',\n",
       "  'creators': 'KangWenwen',\n",
       "  'abstractNote': 'Reducing travel time of emergency vehicles (EVs) has a potential in significant savings of life and property. Integrating modern intelligent transportation system (ITS) with EV signal preemption seems to be a solution. But existing EV signal preemption systems often break the current signal coordination and impact a lot on the normal traffic streams. In this paper we propose an emergency vehicle signal coordination (EVSC) approach， which is intended to provide “green wave” for EVs. Traffic simulations are conducted along an emergency corridor with 8 intersections in Qingdao， China. Multiple traffic measurements are compared between simulation outputs with and without EVSC operation. The result indicates that the proposed approach can reduce EV travel time by 26.9% without too much negative impact on the normal traffic streams.',\n",
       "  'conferenceName': '17th International IEEE Conference on Intelligent Transportation Systems (ITSC)',\n",
       "  'date': '2014-10',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/6957683',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Cities and towns',\n",
       "   'Delays',\n",
       "   'Mathematical model',\n",
       "   'Roads',\n",
       "   'Traffic control',\n",
       "   'Vehicles']},\n",
       " {'title': 'Optimal Motion Control for Connected and Automated Electric Vehicles at Signal-Free Intersections',\n",
       "  'creators': 'PanXiao',\n",
       "  'abstractNote': 'Traffic congestion is one of the major issues for urban traffic networks. The connected and autonomous vehicles (CAV) is an emerging technology that has the potential to address this issue by improving safety， efficiency， and capacity of the transportation system. In this paper， the problem of optimal trajectory planning of battery-electric CAVs in the context of cooperative crossing of an unsignalized intersection is addressed. An optimization-based centralized intersection controller is proposed to find the optimal velocity trajectory of each vehicle so as to minimize electric energy consumption and traffic throughput. Solving the underlying optimization problem for a group of CAVs is not straightforward because of the nonlinear and nonconvex dynamics， especially when the powertrain model is explicitly modelled. In order to ensure a rapid solution search and a unique global optimum， the optimal control problem (OCP) is reformulated via convex modeling techniques. Several simulation case studies show the effectiveness of the proposed approach and the trade-off between energy consumption and traffic throughput is illustrated.',\n",
       "  'conferenceName': '2020 59th IEEE Conference on Decision and Control (CDC)',\n",
       "  'date': '2020-12',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/9304392',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Batteries',\n",
       "   'Energy consumption',\n",
       "   'Mechanical power transmission',\n",
       "   'Safety',\n",
       "   'Torque',\n",
       "   'Trajectory',\n",
       "   'Vehicle dynamics']},\n",
       " {'title': 'Latency-Robust Control of High-Speed Signal-Free Intersections',\n",
       "  'creators': 'LiuYang',\n",
       "  'abstractNote': \"High-speed signal-free intersections are a novel urban traffic operation enabled by connected and autonomous vehicles. However， the impact of communication latency on intersection performance has not been well understood. In this paper， we consider vehicle coordination at signal-free intersections with latency. We focus on two questions: (i) how to ensure latency-resiliency of the coordination algorithm， and (ii) how latency affects the intersection's capacity. We consider a trajectory-based model with bounded speed uncertainties. Latency leads to uncertain state observation. We propose a piecewise-linear control law that ensures safety (avoidance of interference) as long as the initial condition is safe. We also analytically quantify the throughput that the proposed control can attain in the face of latency.\",\n",
       "  'conferenceName': '2021 American Control Conference (ACC)',\n",
       "  'date': '2021-05',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/abstract/document/9482689',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Autonomous vehicles',\n",
       "   'Faces',\n",
       "   'Interference',\n",
       "   'Safety',\n",
       "   'Signal-free intersections',\n",
       "   'Throughput',\n",
       "   'Uncertainty',\n",
       "   'connected and autonomous vehicles',\n",
       "   'robust control']},\n",
       " {'title': 'Development of Autonomous Drones for Adaptive Obstacle Avoidance in Real World Environments',\n",
       "  'creators': 'DevosArne',\n",
       "  'abstractNote': 'Recently， drones have been involved in several critical tasks such as infrastructure inspection， crisis response， and search and rescue operations. Such drones mostly use sophisticated computer vision techniques to effectively avoid obstacles and， thereby， require high computational power. Therefore， this work tuned and tested a computationally inexpensive algorithm， previously developed by the authors， for adaptive obstacle avoidance control of a drone. The algorithm aims at protecting the drone from entering in complex situations such as deadlocks and corners. The algorithm has been validated through simulation and implemented on a newly developed drone platform for infrastructure inspection. The design of the drone platform and the experimental results are presented in this study.',\n",
       "  'conferenceName': '2018 21st Euromicro Conference on Digital System Design (DSD)',\n",
       "  'date': '2018-08',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/8491889',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Adaptive obstacle avoidance',\n",
       "   'Autonomous drone system',\n",
       "   'Collision avoidance',\n",
       "   'Drones',\n",
       "   'Implementation',\n",
       "   'Laser radar',\n",
       "   'Navigation',\n",
       "   'Propellers',\n",
       "   'Signal processing algorithms',\n",
       "   'Simulation',\n",
       "   'System recovery']},\n",
       " {'title': 'Deep Reinforcement Learning for Persistent Cruise Control in UAV-aided Data Collection',\n",
       "  'creators': 'KurunathanHarrison',\n",
       "  'abstractNote': 'Autonomous UAV cruising is gaining attention due to its flexible deployment in remote sensing， surveillance， and reconnaissance. A critical challenge in data collection with the autonomous UAV is the buffer overflows at the ground sensors and packet loss due to lossy airborne channels. Trajectory planning of the UAV is vital to alleviate buffer overflows as well as channel fading. In this work， we propose a Deep Deterministic Policy Gradient based Cruise Control (DDPG-CC) to reduce the overall packet loss through online training of headings and cruise velocity of the UAV， as well as the selection of the ground sensors for data collection. Preliminary performance evaluation demonstrates that DDPG-CC reduces the packet loss rate by under 5% when sufficient training is provided to the UAV.',\n",
       "  'conferenceName': '2021 IEEE 46th Conference on Local Computer Networks (LCN)',\n",
       "  'date': '2021-10',\n",
       "  'language': '',\n",
       "  'url': 'https://ieeexplore.ieee.org/document/9525022',\n",
       "  'libraryCatalog': 'IEEE Xplore',\n",
       "  'tags': ['Autonomous UAV',\n",
       "   'Buffer overflows',\n",
       "   'Cruise control',\n",
       "   'Data collection',\n",
       "   'Deep reinforcement learning',\n",
       "   'Packet loss',\n",
       "   'Reinforcement learning',\n",
       "   'Training',\n",
       "   'Trajectory planning',\n",
       "   'UAV-aided WSN',\n",
       "   'Wireless sensor networks']}]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "conferencePaper_data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "435db678-938e-444d-a7da-fc762d231ebf",
   "metadata": {},
   "source": [
    "## 分别保存"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "825ec302-87c7-44a4-bf13-50cc4304632a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "aa08ed42-58e0-422d-be3c-3ddf0faa4185",
   "metadata": {},
   "outputs": [],
   "source": [
    "journalArticle_fields = [\n",
    "    'title',\n",
    "    'creators',\n",
    "    'abstractNote',\n",
    "    'publicationTitle',\n",
    "    'date',\n",
    "    'language',\n",
    "    'url',\n",
    "    'libraryCatalog',\n",
    "    'tags'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "809911cd-fab0-442f-a5be-01f0da0c5486",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CSV 文件已生成: ../dataset/journal_item_0725.csv\n"
     ]
    }
   ],
   "source": [
    "# 指定要保存的文件名\n",
    "filename = '../dataset/journal_item_0725.csv'\n",
    "\n",
    "# 打开文件并写入数据\n",
    "with open(filename, 'w', newline='', encoding='utf-8') as csvfile:\n",
    "    writer = csv.DictWriter(csvfile, fieldnames=journalArticle_fields)\n",
    "    # 写入表头\n",
    "    writer.writeheader()\n",
    "    # 写入数据\n",
    "    writer.writerows(journalArticle_data)\n",
    "\n",
    "print('CSV 文件已生成:', filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "1d6c0e2f-086c-4269-b13e-c44d9b5f968f",
   "metadata": {},
   "outputs": [],
   "source": [
    "preprint_fields = [\n",
    "    'title',\n",
    "    'creators',\n",
    "    'abstractNote',\n",
    "    'date',\n",
    "    'language',\n",
    "    'url',\n",
    "    'libraryCatalog',\n",
    "    'tags'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "a2b5f946-6199-462f-b964-6e1e1e3e6091",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CSV 文件已生成: ../dataset/preprint_item_0725.csv\n"
     ]
    }
   ],
   "source": [
    "# 指定要保存的文件名\n",
    "filename = '../dataset/preprint_item_0725.csv'\n",
    "\n",
    "# 打开文件并写入数据\n",
    "with open(filename, 'w', newline='', encoding='utf-8') as csvfile:\n",
    "    writer = csv.DictWriter(csvfile, fieldnames=preprint_fields)\n",
    "    # 写入表头\n",
    "    writer.writeheader()\n",
    "    # 写入数据\n",
    "    writer.writerows(preprint_data)\n",
    "\n",
    "print('CSV 文件已生成:', filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "31c91aa4-292c-4324-9e1f-2a7fafcc578f",
   "metadata": {},
   "outputs": [],
   "source": [
    "conferencePaper_fields = [\n",
    "    'title',\n",
    "    'creators',\n",
    "    'abstractNote',\n",
    "    'conferenceName',\n",
    "    'date',\n",
    "    'language',\n",
    "    'url',\n",
    "    'libraryCatalog',\n",
    "    'tags'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "7eda63f5-209c-4ff8-bde9-a5c839488d59",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CSV 文件已生成: ../dataset/conference_item_0725.csv\n"
     ]
    }
   ],
   "source": [
    "# 指定要保存的文件名\n",
    "filename = '../dataset/conference_item_0725.csv'\n",
    "\n",
    "# 打开文件并写入数据\n",
    "with open(filename, 'w', newline='', encoding='utf-8') as csvfile:\n",
    "    writer = csv.DictWriter(csvfile, fieldnames=conferencePaper_fields)\n",
    "    # 写入表头\n",
    "    writer.writeheader()\n",
    "    # 写入数据\n",
    "    writer.writerows(conferencePaper_data)\n",
    "\n",
    "print('CSV 文件已生成:', filename)"
   ]
  },
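  {
   "cell_type": "markdown",
   "id": "3f9a1b2c-5d6e-4f70-8a91-b2c3d4e5f601",
   "metadata": {},
   "source": [
    "The three writer cells above repeat the same `csv.DictWriter` pattern. Below is a minimal refactoring sketch; the helper name `save_to_csv` and the flattening of the `tags` list into a `'; '`-joined string are illustrative assumptions, not part of the original workflow (without flattening, `DictWriter` writes the Python `repr` of the list into the CSV cell)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a0b2c3d-6e7f-4081-9ab2-c3d4e5f60712",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical helper consolidating the three writer cells above\n",
    "def save_to_csv(filename, fields, rows, join_tags=True):\n",
    "    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:\n",
    "        writer = csv.DictWriter(csvfile, fieldnames=fields)\n",
    "        writer.writeheader()\n",
    "        for row in rows:\n",
    "            if join_tags and isinstance(row.get('tags'), list):\n",
    "                # Flatten the tag list into readable text for the CSV cell\n",
    "                row = {**row, 'tags': '; '.join(row['tags'])}\n",
    "            writer.writerow(row)\n",
    "    print('CSV file written:', filename)\n",
    "\n",
    "# Usage, equivalent to the cell above:\n",
    "# save_to_csv('../dataset/conference_item_0725.csv', conferencePaper_fields, conferencePaper_data)"
   ]
  },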
  {
   "cell_type": "markdown",
   "id": "ff174feb-ae7c-4081-822e-ee61be5fac9a",
   "metadata": {},
   "source": [
    "## 一起保存（取交集）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ce07a40d-6f06-49e2-a571-80258ca5ca80",
   "metadata": {},
   "outputs": [],
   "source": [
    "fields = [\n",
    "    'title',\n",
    "    'creators',\n",
    "    'abstractNote',\n",
    "    'publicationTitle',\n",
    "    'date',\n",
    "    'language',\n",
    "    'url',\n",
    "    'libraryCatalog',\n",
    "    'tags'\n",
    "]"
   ]
  },
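  {
   "cell_type": "markdown",
   "id": "5b1c3d4e-7f80-4192-ab3c-d4e5f6071823",
   "metadata": {},
   "source": [
    "Note that `publicationTitle` exists only in the journal schema, so the strict intersection of the three field lists drops it (along with `conferenceName`). Below is a minimal sketch of one way to realize the heading's intent; the merging step and the output path `../dataset/all_items_0725.csv` are assumptions, not the notebook's original code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c2d4e5f-8091-42a3-bc4d-e5f607182934",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the fields common to all three item types (assumed approach)\n",
    "common_fields = [f for f in fields\n",
    "                 if f in preprint_fields and f in conferencePaper_fields]\n",
    "\n",
    "# Merge the three lists, projecting each row onto the common fields\n",
    "merged = []\n",
    "for rows in (journalArticle_data, preprint_data, conferencePaper_data):\n",
    "    for row in rows:\n",
    "        merged.append({f: row.get(f, '') for f in common_fields})\n",
    "\n",
    "filename = '../dataset/all_items_0725.csv'  # illustrative path\n",
    "with open(filename, 'w', newline='', encoding='utf-8') as csvfile:\n",
    "    writer = csv.DictWriter(csvfile, fieldnames=common_fields)\n",
    "    writer.writeheader()\n",
    "    writer.writerows(merged)\n",
    "print('CSV file written:', filename)"
   ]
  },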
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "650a8666-9884-45fb-ae91-313ef71ff342",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'key': 'CQFLFXWJ', 'version': 5227, 'itemType': 'preprint', 'title': 'Denoising Diffusion Probabilistic Models', 'creators': [{'creatorType': 'author', 'firstName': 'Jonathan', 'lastName': 'Ho'}, {'creatorType': 'author', 'firstName': 'Ajay', 'lastName': 'Jain'}, {'creatorType': 'author', 'firstName': 'Pieter', 'lastName': 'Abbeel'}], 'abstractNote': 'We present high quality image synthesis results using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics, and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset, we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN, we obtain sample quality similar to ProgressiveGAN. Our implementation is available at https://github.com/hojonathanho/diffusion', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2006.11239', 'place': '', 'date': '2020-12-16', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2006.11239', 'citationKey': '', 'url': 'http://arxiv.org/abs/2006.11239', 'accessDate': '2025-07-22T05:28:21Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2006.11239 [cs]', 'tags': [{'tag': 'Computer Science - Machine Learning', 'type': 1}, {'tag': 'Statistics - Machine Learning', 'type': 1}], 'collections': ['VM4CAJSG'], 'relations': {}, 'dateAdded': '2025-07-22T05:28:21Z', 'dateModified': '2025-07-22T05:28:26Z'}\n",
      "{'key': 'XX6M2B9B', 'version': 5221, 'itemType': 'preprint', 'title': 'StyleStudio: Text-Driven Style Transfer with Selective Control of Style Elements', 'creators': [{'creatorType': 'author', 'firstName': 'Mingkun', 'lastName': 'Lei'}, {'creatorType': 'author', 'firstName': 'Xue', 'lastName': 'Song'}, {'creatorType': 'author', 'firstName': 'Beier', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Hao', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Chi', 'lastName': 'Zhang'}], 'abstractNote': 'Text-driven style transfer aims to merge the style of a reference image with content described by a text prompt. Recent advancements in text-to-image models have improved the nuance of style transformations, yet significant challenges remain, particularly with overfitting to reference styles, limiting stylistic control, and misaligning with textual content. In this paper, we propose three complementary strategies to address these issues. First, we introduce a cross-modal Adaptive Instance Normalization (AdaIN) mechanism for better integration of style and text features, enhancing alignment. Second, we develop a Style-based Classifier-Free Guidance (SCFG) approach that enables selective control over stylistic elements, reducing irrelevant influences. Finally, we incorporate a teacher model during early generation stages to stabilize spatial layouts and mitigate artifacts. Our extensive evaluations demonstrate significant improvements in style transfer quality and alignment with textual prompts. Furthermore, our approach can be integrated into existing style transfer frameworks without fine-tuning.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2412.08503', 'place': '', 'date': '2025-03-27', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2412.08503', 'citationKey': '', 'url': 'http://arxiv.org/abs/2412.08503', 'accessDate': '2025-07-22T03:48:08Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'StyleStudio', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2412.08503 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:48:08Z', 'dateModified': '2025-07-22T03:48:08Z'}\n",
      "{'key': 'ZF7EA985', 'version': 5218, 'itemType': 'preprint', 'title': 'Efficient Diffusion as Low Light Enhancer', 'creators': [{'creatorType': 'author', 'firstName': 'Guanzhou', 'lastName': 'Lan'}, {'creatorType': 'author', 'firstName': 'Qianli', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Yuqi', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Zhigang', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Dong', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Xuelong', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Bin', 'lastName': 'Zhao'}], 'abstractNote': 'The computational burden of the iterative sampling process remains a major challenge in diffusion-based Low-Light Image Enhancement (LLIE). Current acceleration methods, whether training-based or training-free, often lead to significant performance degradation, highlighting the trade-off between performance and efficiency. In this paper, we identify two primary factors contributing to performance degradation: fitting errors and the inference gap. Our key insight is that fitting errors can be mitigated by linearly extrapolating the incorrect score functions, while the inference gap can be reduced by shifting the Gaussian flow to a reflectance-aware residual space. Based on the above insights, we design Reflectance-Aware Trajectory Refinement (RATR) module, a simple yet effective module to refine the teacher trajectory using the reflectance component of images. Following this, we introduce \\\\textbf{Re}flectance-aware \\\\textbf{D}iffusion with \\\\textbf{Di}stilled \\\\textbf{T}rajectory (\\\\textbf{ReDDiT}), an efficient and flexible distillation framework tailored for LLIE. Our framework achieves comparable performance to previous diffusion-based methods with redundant steps in just 2 steps while establishing new state-of-the-art (SOTA) results with 8 or 4 steps. Comprehensive experimental evaluations on 10 benchmark datasets validate the effectiveness of our method, consistently outperforming existing SOTA methods.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2410.12346', 'place': '', 'date': '2024-11-21', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2410.12346', 'citationKey': '', 'url': 'http://arxiv.org/abs/2410.12346', 'accessDate': '2025-07-22T03:47:32Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2410.12346 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:47:32Z', 'dateModified': '2025-07-22T03:47:32Z'}\n",
      "{'key': 'N39JB9HL', 'version': 5216, 'itemType': 'preprint', 'title': 'HVI: A New Color Space for Low-light Image Enhancement', 'creators': [{'creatorType': 'author', 'firstName': 'Qingsen', 'lastName': 'Yan'}, {'creatorType': 'author', 'firstName': 'Yixu', 'lastName': 'Feng'}, {'creatorType': 'author', 'firstName': 'Cheng', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Guansong', 'lastName': 'Pang'}, {'creatorType': 'author', 'firstName': 'Kangbiao', 'lastName': 'Shi'}, {'creatorType': 'author', 'firstName': 'Peng', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Wei', 'lastName': 'Dong'}, {'creatorType': 'author', 'firstName': 'Jinqiu', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'Yanning', 'lastName': 'Zhang'}], 'abstractNote': 'Low-Light Image Enhancement (LLIE) is a crucial computer vision task that aims to restore detailed visual information from corrupted low-light images. Many existing LLIE methods are based on standard RGB (sRGB) space, which often produce color bias and brightness artifacts due to inherent high color sensitivity in sRGB. While converting the images using Hue, Saturation and Value (HSV) color space helps resolve the brightness issue, it introduces significant red and black noise artifacts. To address this issue, we propose a new color space for LLIE, namely Horizontal/Vertical-Intensity (HVI), defined by polarized HS maps and learnable intensity. The former enforces small distances for red coordinates to remove the red artifacts, while the latter compresses the low-light regions to remove the black artifacts. To fully leverage the chromatic and intensity information, a novel Color and Intensity Decoupling Network (CIDNet) is further introduced to learn accurate photometric mapping function under different lighting conditions in the HVI space. Comprehensive results from benchmark and ablation experiments show that the proposed HVI color space with CIDNet outperforms the state-of-the-art methods on 10 datasets. The code is available at https://github.com/Fediory/HVI-CIDNet.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2502.20272', 'place': '', 'date': '2025-02-28', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2502.20272', 'citationKey': '', 'url': 'http://arxiv.org/abs/2502.20272', 'accessDate': '2025-07-22T03:47:07Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'HVI', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2502.20272 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:47:07Z', 'dateModified': '2025-07-22T03:47:07Z'}\n",
      "{'key': 'VM4C3ZBT', 'version': 5214, 'itemType': 'preprint', 'title': 'MonSter: Marry Monodepth to Stereo Unleashes Power', 'creators': [{'creatorType': 'author', 'firstName': 'Junda', 'lastName': 'Cheng'}, {'creatorType': 'author', 'firstName': 'Longliang', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Gangwei', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Xianqi', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Zhaoxing', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Yong', 'lastName': 'Deng'}, {'creatorType': 'author', 'firstName': 'Jinliang', 'lastName': 'Zang'}, {'creatorType': 'author', 'firstName': 'Yurui', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Zhipeng', 'lastName': 'Cai'}, {'creatorType': 'author', 'firstName': 'Xin', 'lastName': 'Yang'}], 'abstractNote': 'Stereo matching recovers depth from image correspondences. Existing methods struggle to handle ill-posed regions with limited matching cues, such as occlusions and textureless areas. To address this, we propose MonSter, a novel method that leverages the complementary strengths of monocular depth estimation and stereo matching. MonSter integrates monocular depth and stereo matching into a dual-branch architecture to iteratively improve each other. Confidence-based guidance adaptively selects reliable stereo cues for monodepth scale-shift recovery. The refined monodepth is in turn guides stereo effectively at ill-posed regions. Such iterative mutual enhancement enables MonSter to evolve monodepth priors from coarse object-level structures to pixel-level geometry, fully unlocking the potential of stereo matching. As shown in Fig.1, MonSter ranks 1st across five most commonly used leaderboards -- SceneFlow, KITTI 2012, KITTI 2015, Middlebury, and ETH3D. Achieving up to 49.5% improvements (Bad 1.0 on ETH3D) over the previous best method. Comprehensive analysis verifies the effectiveness of MonSter in ill-posed regions. In terms of zero-shot generalization, MonSter significantly and consistently outperforms state-of-the-art across the board. The code is publicly available at: https://github.com/Junda24/MonSter.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2501.08643', 'place': '', 'date': '2025-01-15', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2501.08643', 'citationKey': '', 'url': 'http://arxiv.org/abs/2501.08643', 'accessDate': '2025-07-22T03:43:53Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'MonSter', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2501.08643 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2025-07-22T03:43:53Z', 'dateModified': '2025-07-22T03:43:53Z'}\n",
      "{'key': 'HB4Q7FPV', 'version': 5207, 'itemType': 'preprint', 'title': 'MonSter: Marry Monodepth to Stereo Unleashes Power', 'creators': [{'creatorType': 'author', 'firstName': 'Junda', 'lastName': 'Cheng'}, {'creatorType': 'author', 'firstName': 'Longliang', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Gangwei', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Xianqi', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Zhaoxing', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Yong', 'lastName': 'Deng'}, {'creatorType': 'author', 'firstName': 'Jinliang', 'lastName': 'Zang'}, {'creatorType': 'author', 'firstName': 'Yurui', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Zhipeng', 'lastName': 'Cai'}, {'creatorType': 'author', 'firstName': 'Xin', 'lastName': 'Yang'}], 'abstractNote': 'Stereo matching recovers depth from image correspondences. Existing methods struggle to handle ill-posed regions with limited matching cues, such as occlusions and textureless areas. To address this, we propose MonSter, a novel method that leverages the complementary strengths of monocular depth estimation and stereo matching. MonSter integrates monocular depth and stereo matching into a dual-branch architecture to iteratively improve each other. Confidence-based guidance adaptively selects reliable stereo cues for monodepth scale-shift recovery. The refined monodepth is in turn guides stereo effectively at ill-posed regions. Such iterative mutual enhancement enables MonSter to evolve monodepth priors from coarse object-level structures to pixel-level geometry, fully unlocking the potential of stereo matching. As shown in Fig.1, MonSter ranks 1st across five most commonly used leaderboards -- SceneFlow, KITTI 2012, KITTI 2015, Middlebury, and ETH3D. Achieving up to 49.5% improvements (Bad 1.0 on ETH3D) over the previous best method. Comprehensive analysis verifies the effectiveness of MonSter in ill-posed regions. In terms of zero-shot generalization, MonSter significantly and consistently outperforms state-of-the-art across the board. The code is publicly available at: https://github.com/Junda24/MonSter.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2501.08643', 'place': '', 'date': '2025-01-15', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2501.08643', 'citationKey': '', 'url': 'http://arxiv.org/abs/2501.08643', 'accessDate': '2025-07-22T03:43:27Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'MonSter', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2501.08643 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:43:27Z', 'dateModified': '2025-07-22T03:43:27Z'}\n",
      "{'key': 'H4K3UIJG', 'version': 5205, 'itemType': 'preprint', 'title': 'DepthCrafter: Generating Consistent Long Depth Sequences for Open-world Videos', 'creators': [{'creatorType': 'author', 'firstName': 'Wenbo', 'lastName': 'Hu'}, {'creatorType': 'author', 'firstName': 'Xiangjun', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Xiaoyu', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Sijie', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Xiaodong', 'lastName': 'Cun'}, {'creatorType': 'author', 'firstName': 'Yong', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Long', 'lastName': 'Quan'}, {'creatorType': 'author', 'firstName': 'Ying', 'lastName': 'Shan'}], 'abstractNote': 'Estimating video depth in open-world scenarios is challenging due to the diversity of videos in appearance, content motion, camera movement, and length. We present DepthCrafter, an innovative method for generating temporally consistent long depth sequences with intricate details for open-world videos, without requiring any supplementary information such as camera poses or optical flow. The generalization ability to open-world videos is achieved by training the video-to-depth model from a pre-trained image-to-video diffusion model, through our meticulously designed three-stage training strategy. Our training approach enables the model to generate depth sequences with variable lengths at one time, up to 110 frames, and harvest both precise depth details and rich content diversity from realistic and synthetic datasets. We also propose an inference strategy that can process extremely long videos through segment-wise estimation and seamless stitching. Comprehensive evaluations on multiple datasets reveal that DepthCrafter achieves state-of-the-art performance in open-world video depth estimation under zero-shot settings. Furthermore, DepthCrafter facilitates various downstream applications, including depth-based visual effects and conditional video generation.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2409.02095', 'place': '', 'date': '2024-11-27', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2409.02095', 'citationKey': '', 'url': 'http://arxiv.org/abs/2409.02095', 'accessDate': '2025-07-22T03:42:58Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'DepthCrafter', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2409.02095 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Graphics', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:42:58Z', 'dateModified': '2025-07-22T03:42:58Z'}\n",
      "{'key': 'FZUAD9KW', 'version': 5202, 'itemType': 'preprint', 'title': 'Universal Actions for Enhanced Embodied Foundation Models', 'creators': [{'creatorType': 'author', 'firstName': 'Jinliang', 'lastName': 'Zheng'}, {'creatorType': 'author', 'firstName': 'Jianxiong', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Dongxiu', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Yinan', 'lastName': 'Zheng'}, {'creatorType': 'author', 'firstName': 'Zhihao', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Zhonghong', 'lastName': 'Ou'}, {'creatorType': 'author', 'firstName': 'Yu', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Jingjing', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Ya-Qin', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Xianyuan', 'lastName': 'Zhan'}], 'abstractNote': 'Training on diverse, internet-scale data is a key factor in the success of recent large foundation models. Yet, using the same recipe for building embodied agents has faced noticeable difficulties. Despite the availability of many crowd-sourced embodied datasets, their action spaces often exhibit significant heterogeneity due to distinct physical embodiment and control interfaces for different robots, causing substantial challenges in developing embodied foundation models using cross-domain data. In this paper, we introduce UniAct, a new embodied foundation modeling framework operating in a Universal Action Space. Our learned universal actions capture the generic atomic behaviors across diverse robots by exploiting their shared structural features, and enable enhanced cross-domain data utilization and cross-embodiment generalizations by eliminating the notorious heterogeneity. The universal actions can be efficiently translated back to heterogeneous actionable commands by simply adding embodiment-specific details, from which fast adaptation to new robots becomes simple and straightforward. Our 0.5B instantiation of UniAct outperforms 14X larger SOTA embodied foundation models in extensive evaluations on various real-world and simulation robots, showcasing exceptional cross-embodiment control and adaptation capability, highlighting the crucial benefit of adopting universal actions. Project page: https://github.com/2toinf/UniAct', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2501.10105', 'place': '', 'date': '2025-03-08', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2501.10105', 'citationKey': '', 'url': 'http://arxiv.org/abs/2501.10105', 'accessDate': '2025-07-22T03:39:41Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2501.10105 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Robotics', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:39:41Z', 'dateModified': '2025-07-22T03:39:41Z'}\n",
      "{'key': 'WJ5QTVPG', 'version': 5198, 'itemType': 'preprint', 'title': 'Number it: Temporal Grounding Videos like Flipping Manga', 'creators': [{'creatorType': 'author', 'firstName': 'Yongliang', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Xinting', 'lastName': 'Hu'}, {'creatorType': 'author', 'firstName': 'Yuyang', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'Yizhou', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Wenbo', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Fengyun', 'lastName': 'Rao'}, {'creatorType': 'author', 'firstName': 'Bernt', 'lastName': 'Schiele'}, {'creatorType': 'author', 'firstName': 'Xu', 'lastName': 'Yang'}], 'abstractNote': 'Video Large Language Models (Vid-LLMs) have made remarkable advancements in comprehending video content for QA dialogue. However, they struggle to extend this visual understanding to tasks requiring precise temporal localization, known as Video Temporal Grounding (VTG). To address this gap, we introduce Number-Prompt (NumPro), a novel method that empowers Vid-LLMs to bridge visual comprehension with temporal grounding by adding unique numerical identifiers to each video frame. Treating a video as a sequence of numbered frame images, NumPro transforms VTG into an intuitive process: flipping through manga panels in sequence. This allows Vid-LLMs to \"read\" event timelines, accurately linking visual content with corresponding temporal information. Our experiments demonstrate that NumPro significantly boosts VTG performance of top-tier Vid-LLMs without additional computational cost. Furthermore, fine-tuning on a NumPro-enhanced dataset defines a new state-of-the-art for VTG, surpassing previous top-performing methods by up to 6.9\\\\% in mIoU for moment retrieval and 8.5\\\\% in mAP for highlight detection. The code will be available at https://github.com/yongliang-wu/NumPro.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2411.10332', 'place': '', 'date': '2025-03-21', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2411.10332', 'citationKey': '', 'url': 'http://arxiv.org/abs/2411.10332', 'accessDate': '2025-07-22T03:39:16Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Number it', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2411.10332 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:39:16Z', 'dateModified': '2025-07-22T03:39:16Z'}\n",
      "{'key': 'E9A5MRVD', 'version': 5197, 'itemType': 'preprint', 'title': 'SemGeoMo: Dynamic Contextual Human Motion Generation with Semantic and Geometric Guidance', 'creators': [{'creatorType': 'author', 'firstName': 'Peishan', 'lastName': 'Cong'}, {'creatorType': 'author', 'firstName': 'Ziyi', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yuexin', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Xiangyu', 'lastName': 'Yue'}], 'abstractNote': 'Generating reasonable and high-quality human interactive motions in a given dynamic environment is crucial for understanding, modeling, transferring, and applying human behaviors to both virtual and physical robots. In this paper, we introduce an effective method, SemGeoMo, for dynamic contextual human motion generation, which fully leverages the text-affordance-joint multi-level semantic and geometric guidance in the generation process, improving the semantic rationality and geometric correctness of generative motions. Our method achieves state-of-the-art performance on three datasets and demonstrates superior generalization capability for diverse interaction scenarios. The project page and code can be found at https://4dvlab.github.io/project_page/semgeomo/.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2503.01291', 'place': '', 'date': '2025-03-03', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2503.01291', 'citationKey': '', 'url': 'http://arxiv.org/abs/2503.01291', 'accessDate': '2025-07-22T03:38:28Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'SemGeoMo', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2503.01291 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:38:28Z', 'dateModified': '2025-07-22T03:38:28Z'}\n",
      "{'key': 'PRRCJVEH', 'version': 5196, 'itemType': 'preprint', 'title': 'Fast3R: Towards 3D Reconstruction of 1000+ Images in One Forward Pass', 'creators': [{'creatorType': 'author', 'firstName': 'Jianing', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Alexander', 'lastName': 'Sax'}, {'creatorType': 'author', 'firstName': 'Kevin J.', 'lastName': 'Liang'}, {'creatorType': 'author', 'firstName': 'Mikael', 'lastName': 'Henaff'}, {'creatorType': 'author', 'firstName': 'Hao', 'lastName': 'Tang'}, {'creatorType': 'author', 'firstName': 'Ang', 'lastName': 'Cao'}, {'creatorType': 'author', 'firstName': 'Joyce', 'lastName': 'Chai'}, {'creatorType': 'author', 'firstName': 'Franziska', 'lastName': 'Meier'}, {'creatorType': 'author', 'firstName': 'Matt', 'lastName': 'Feiszli'}], 'abstractNote': \"Multi-view 3D reconstruction remains a core challenge in computer vision, particularly in applications requiring accurate and scalable representations across diverse perspectives. Current leading methods such as DUSt3R employ a fundamentally pairwise approach, processing images in pairs and necessitating costly global alignment procedures to reconstruct from multiple views. In this work, we propose Fast 3D Reconstruction (Fast3R), a novel multi-view generalization to DUSt3R that achieves efficient and scalable 3D reconstruction by processing many views in parallel. Fast3R's Transformer-based architecture forwards N images in a single forward pass, bypassing the need for iterative alignment. Through extensive experiments on camera pose estimation and 3D reconstruction, Fast3R demonstrates state-of-the-art performance, with significant improvements in inference speed and reduced error accumulation. These results establish Fast3R as a robust alternative for multi-view applications, offering enhanced scalability without compromising reconstruction accuracy.\", 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2501.13928', 'place': '', 'date': '2025-03-19', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2501.13928', 'citationKey': '', 'url': 'http://arxiv.org/abs/2501.13928', 'accessDate': '2025-07-22T03:37:54Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Fast3R', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2501.13928 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Graphics', 'type': 1}, {'tag': 'Computer Science - Robotics', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:37:54Z', 'dateModified': '2025-07-22T03:37:54Z'}\n",
      "{'key': 'A54QK4FW', 'version': 5195, 'itemType': 'preprint', 'title': 'StdGEN: Semantic-Decomposed 3D Character Generation from Single Images', 'creators': [{'creatorType': 'author', 'firstName': 'Yuze', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Yanning', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Wang', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Zhongkai', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Kaiwen', 'lastName': 'Xiao'}, {'creatorType': 'author', 'firstName': 'Wei', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Yong-Jin', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Xiao', 'lastName': 'Han'}], 'abstractNote': 'We present StdGEN, an innovative pipeline for generating semantically decomposed high-quality 3D characters from single images, enabling broad applications in virtual reality, gaming, and filmmaking, etc. Unlike previous methods which struggle with limited decomposability, unsatisfactory quality, and long optimization times, StdGEN features decomposability, effectiveness and efficiency; i.e., it generates intricately detailed 3D characters with separated semantic components such as the body, clothes, and hair, in three minutes. At the core of StdGEN is our proposed Semantic-aware Large Reconstruction Model (S-LRM), a transformer-based generalizable model that jointly reconstructs geometry, color and semantics from multi-view images in a feed-forward manner. A differentiable multi-layer semantic surface extraction scheme is introduced to acquire meshes from hybrid implicit fields reconstructed by our S-LRM. Additionally, a specialized efficient multi-view diffusion model and an iterative multi-layer surface refinement module are integrated into the pipeline to facilitate high-quality, decomposable 3D character generation. Extensive experiments demonstrate our state-of-the-art performance in 3D anime character generation, surpassing existing baselines by a significant margin in geometry, texture and decomposability. StdGEN offers ready-to-use semantic-decomposed 3D characters and enables flexible customization for a wide range of applications. Project page: https://stdgen.github.io', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2411.05738', 'place': '', 'date': '2025-03-05', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2411.05738', 'citationKey': '', 'url': 'http://arxiv.org/abs/2411.05738', 'accessDate': '2025-07-22T03:37:03Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'StdGEN', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2411.05738 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:37:03Z', 'dateModified': '2025-07-22T03:37:03Z'}\n",
      "{'key': 'UK9PZLIZ', 'version': 5193, 'itemType': 'preprint', 'title': \"h-Edit: Effective and Flexible Diffusion-Based Editing via Doob's h-Transform\", 'creators': [{'creatorType': 'author', 'firstName': 'Toan', 'lastName': 'Nguyen'}, {'creatorType': 'author', 'firstName': 'Kien', 'lastName': 'Do'}, {'creatorType': 'author', 'firstName': 'Duc', 'lastName': 'Kieu'}, {'creatorType': 'author', 'firstName': 'Thin', 'lastName': 'Nguyen'}], 'abstractNote': 'We introduce a theoretical framework for diffusion-based image editing by formulating it as a reverse-time bridge modeling problem. This approach modifies the backward process of a pretrained diffusion model to construct a bridge that converges to an implicit distribution associated with the editing target at time 0. Building on this framework, we propose h-Edit, a novel editing method that utilizes Doob\\'s h-transform and Langevin Monte Carlo to decompose the update of an intermediate edited sample into two components: a \"reconstruction\" term and an \"editing\" term. This decomposition provides flexibility, allowing the reconstruction term to be computed via existing inversion techniques and enabling the combination of multiple editing terms to handle complex editing tasks. To our knowledge, h-Edit is the first training-free method capable of performing simultaneous text-guided and reward-model-based editing. Extensive experiments, both quantitative and qualitative, show that h-Edit outperforms state-of-the-art baselines in terms of editing effectiveness and faithfulness. Our source code is available at https://github.com/nktoan/h-edit.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2503.02187', 'place': '', 'date': '2025-03-04', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2503.02187', 'citationKey': '', 'url': 'http://arxiv.org/abs/2503.02187', 'accessDate': '2025-07-22T03:36:13Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'h-Edit', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2503.02187 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:36:13Z', 'dateModified': '2025-07-22T03:36:13Z'}\n",
      "{'key': 'SENH2EDE', 'version': 5192, 'itemType': 'preprint', 'title': 'Generative Gaussian Splatting for Unbounded 3D City Generation', 'creators': [{'creatorType': 'author', 'firstName': 'Haozhe', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Zhaoxi', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Fangzhou', 'lastName': 'Hong'}, {'creatorType': 'author', 'firstName': 'Ziwei', 'lastName': 'Liu'}], 'abstractNote': '3D city generation with NeRF-based methods shows promising generation results but is computationally inefficient. Recently 3D Gaussian Splatting (3D-GS) has emerged as a highly efficient alternative for object-level 3D generation. However, adapting 3D-GS from finite-scale 3D objects and humans to infinite-scale 3D cities is non-trivial. Unbounded 3D city generation entails significant storage overhead (out-of-memory issues), arising from the need to expand points to billions, often demanding hundreds of Gigabytes of VRAM for a city scene spanning 10km^2. In this paper, we propose GaussianCity, a generative Gaussian Splatting framework dedicated to efficiently synthesizing unbounded 3D cities with a single feed-forward pass. Our key insights are two-fold: 1) Compact 3D Scene Representation: We introduce BEV-Point as a highly compact intermediate representation, ensuring that the growth in VRAM usage for unbounded scenes remains constant, thus enabling unbounded city generation. 2) Spatial-aware Gaussian Attribute Decoder: We present spatial-aware BEV-Point decoder to produce 3D Gaussian attributes, which leverages Point Serializer to integrate the structural and contextual characteristics of BEV points. Extensive experiments demonstrate that GaussianCity achieves state-of-the-art results in both drone-view and street-view 3D city generation. Notably, compared to CityDreamer, GaussianCity exhibits superior performance with a speedup of 60 times (10.72 FPS v.s. 0.18 FPS).', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2406.06526', 'place': '', 'date': '2025-02-27', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2406.06526', 'citationKey': '', 'url': 'http://arxiv.org/abs/2406.06526', 'accessDate': '2025-07-22T03:35:46Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2406.06526 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:35:46Z', 'dateModified': '2025-07-22T03:35:46Z'}\n",
      "{'key': 'DNGI6KII', 'version': 5184, 'itemType': 'preprint', 'title': 'Edit Away and My Face Will not Stay: Personal Biometric Defense against Malicious Generative Editing', 'creators': [{'creatorType': 'author', 'firstName': 'Hanhui', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yihua', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Ruizheng', 'lastName': 'Bai'}, {'creatorType': 'author', 'firstName': 'Yue', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Sijia', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Zhengzhong', 'lastName': 'Tu'}], 'abstractNote': 'Recent advancements in diffusion models have made generative image editing more accessible, enabling creative edits but raising ethical concerns, particularly regarding malicious edits to human portraits that threaten privacy and identity security. Existing protection methods primarily rely on adversarial perturbations to nullify edits but often fail against diverse editing requests. We propose FaceLock, a novel approach to portrait protection that optimizes adversarial perturbations to destroy or significantly alter biometric information, rendering edited outputs biometrically unrecognizable. FaceLock integrates facial recognition and visual perception into perturbation optimization to provide robust protection against various editing attempts. We also highlight flaws in commonly used evaluation metrics and reveal how they can be manipulated, emphasizing the need for reliable assessments of protection. Experiments show FaceLock outperforms baselines in defending against malicious edits and is robust against purification techniques. Ablation studies confirm its stability and broad applicability across diffusion-based editing algorithms. Our work advances biometric defense and sets the foundation for privacy-preserving practices in image editing. The code is available at: https://github.com/taco-group/FaceLock.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2411.16832', 'place': '', 'date': '2025-03-15', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2411.16832', 'citationKey': '', 'url': 'http://arxiv.org/abs/2411.16832', 'accessDate': '2025-07-22T03:33:45Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Edit Away and My Face Will not Stay', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2411.16832 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:33:45Z', 'dateModified': '2025-07-22T03:33:45Z'}\n",
      "{'key': 'GG2WIASG', 'version': 5180, 'itemType': 'preprint', 'title': 'AR-Diffusion: Asynchronous Video Generation with Auto-Regressive Diffusion', 'creators': [{'creatorType': 'author', 'firstName': 'Mingzhen', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'Weining', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Gen', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Jiawei', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Jiahui', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'Wanquan', 'lastName': 'Feng'}, {'creatorType': 'author', 'firstName': 'Shanshan', 'lastName': 'Lao'}, {'creatorType': 'author', 'firstName': 'SiYu', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Qian', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Jing', 'lastName': 'Liu'}], 'abstractNote': 'The task of video generation requires synthesizing visually realistic and temporally coherent video frames. Existing methods primarily use asynchronous auto-regressive models or synchronous diffusion models to address this challenge. However, asynchronous auto-regressive models often suffer from inconsistencies between training and inference, leading to issues such as error accumulation, while synchronous diffusion models are limited by their reliance on rigid sequence length. To address these issues, we introduce Auto-Regressive Diffusion (AR-Diffusion), a novel model that combines the strengths of auto-regressive and diffusion models for flexible, asynchronous video generation. Specifically, our approach leverages diffusion to gradually corrupt video frames in both training and inference, reducing the discrepancy between these phases. Inspired by auto-regressive generation, we incorporate a non-decreasing constraint on the corruption timesteps of individual frames, ensuring that earlier frames remain clearer than subsequent ones. This setup, together with temporal causal attention, enables flexible generation of videos with varying lengths while preserving temporal coherence. In addition, we design two specialized timestep schedulers: the FoPP scheduler for balanced timestep sampling during training, and the AD scheduler for flexible timestep differences during inference, supporting both synchronous and asynchronous generation. Extensive experiments demonstrate the superiority of our proposed method, which achieves competitive and state-of-the-art results across four challenging benchmarks.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2503.07418', 'place': '', 'date': '2025-03-10', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2503.07418', 'citationKey': '', 'url': 'http://arxiv.org/abs/2503.07418', 'accessDate': '2025-07-22T03:33:15Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'AR-Diffusion', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2503.07418 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:33:15Z', 'dateModified': '2025-07-22T03:33:15Z'}\n",
      "{'key': '26Y8YRFL', 'version': 5179, 'itemType': 'preprint', 'title': \"Timestep Embedding Tells: It's Time to Cache for Video Diffusion Model\", 'creators': [{'creatorType': 'author', 'firstName': 'Feng', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Shiwei', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Xiaofeng', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yujie', 'lastName': 'Wei'}, {'creatorType': 'author', 'firstName': 'Haonan', 'lastName': 'Qiu'}, {'creatorType': 'author', 'firstName': 'Yuzhong', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Yingya', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Qixiang', 'lastName': 'Ye'}, {'creatorType': 'author', 'firstName': 'Fang', 'lastName': 'Wan'}], 'abstractNote': 'As a fundamental backbone for video generation, diffusion models are challenged by low inference speed due to the sequential nature of denoising. Previous methods speed up the models by caching and reusing model outputs at uniformly selected timesteps. However, such a strategy neglects the fact that differences among model outputs are not uniform across timesteps, which hinders selecting the appropriate model outputs to cache, leading to a poor balance between inference efficiency and visual quality. In this study, we introduce Timestep Embedding Aware Cache (TeaCache), a training-free caching approach that estimates and leverages the fluctuating differences among model outputs across timesteps. Rather than directly using the time-consuming model outputs, TeaCache focuses on model inputs, which have a strong correlation with the modeloutputs while incurring negligible computational cost. TeaCache first modulates the noisy inputs using the timestep embeddings to ensure their differences better approximating those of model outputs. TeaCache then introduces a rescaling strategy to refine the estimated differences and utilizes them to indicate output caching. Experiments show that TeaCache achieves up to 4.41x acceleration over Open-Sora-Plan with negligible (-0.07% Vbench score) degradation of visual quality.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2411.19108', 'place': '', 'date': '2025-03-18', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2411.19108', 'citationKey': '', 'url': 'http://arxiv.org/abs/2411.19108', 'accessDate': '2025-07-22T03:32:55Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Timestep Embedding Tells', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2411.19108 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:32:55Z', 'dateModified': '2025-07-22T03:32:55Z'}\n",
      "{'key': '9ASVHKHV', 'version': 5177, 'itemType': 'preprint', 'title': 'PhyT2V: LLM-Guided Iterative Self-Refinement for Physics-Grounded Text-to-Video Generation', 'creators': [{'creatorType': 'author', 'firstName': 'Qiyao', 'lastName': 'Xue'}, {'creatorType': 'author', 'firstName': 'Xiangyu', 'lastName': 'Yin'}, {'creatorType': 'author', 'firstName': 'Boyuan', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Wei', 'lastName': 'Gao'}], 'abstractNote': \"Text-to-video (T2V) generation has been recently enabled by transformer-based diffusion models, but current T2V models lack capabilities in adhering to the real-world common knowledge and physical rules, due to their limited understanding of physical realism and deficiency in temporal modeling. Existing solutions are either data-driven or require extra model inputs, but cannot be generalizable to out-of-distribution domains. In this paper, we present PhyT2V, a new data-independent T2V technique that expands the current T2V model's capability of video generation to out-of-distribution domains, by enabling chain-of-thought and step-back reasoning in T2V prompting. Our experiments show that PhyT2V improves existing T2V models' adherence to real-world physical rules by 2.3x, and achieves 35% improvement compared to T2V prompt enhancers. The source codes are available at: https://github.com/pittisl/PhyT2V.\", 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2412.00596', 'place': '', 'date': '2025-04-01', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2412.00596', 'citationKey': '', 'url': 'http://arxiv.org/abs/2412.00596', 'accessDate': '2025-07-22T03:25:31Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'PhyT2V', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2412.00596 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:25:31Z', 'dateModified': '2025-07-22T03:25:31Z'}\n",
      "{'key': 'PKDDV3GG', 'version': 5172, 'itemType': 'preprint', 'title': 'X-Dyna: Expressive Dynamic Human Image Animation', 'creators': [{'creatorType': 'author', 'firstName': 'Di', 'lastName': 'Chang'}, {'creatorType': 'author', 'firstName': 'Hongyi', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'You', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Yipeng', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Zhengfei', 'lastName': 'Kuang'}, {'creatorType': 'author', 'firstName': 'Shengqu', 'lastName': 'Cai'}, {'creatorType': 'author', 'firstName': 'Chenxu', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Guoxian', 'lastName': 'Song'}, {'creatorType': 'author', 'firstName': 'Chao', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yichun', 'lastName': 'Shi'}, {'creatorType': 'author', 'firstName': 'Zeyuan', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Shijie', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Linjie', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Gordon', 'lastName': 'Wetzstein'}, {'creatorType': 'author', 'firstName': 'Mohammad', 'lastName': 'Soleymani'}], 'abstractNote': 'We introduce X-Dyna, a novel zero-shot, diffusion-based pipeline for animating a single human image using facial expressions and body movements derived from a driving video, that generates realistic, context-aware dynamics for both the subject and the surrounding environment. Building on prior approaches centered on human pose control, X-Dyna addresses key shortcomings causing the loss of dynamic details, enhancing the lifelike qualities of human video animations. At the core of our approach is the Dynamics-Adapter, a lightweight module that effectively integrates reference appearance context into the spatial attentions of the diffusion backbone while preserving the capacity of motion modules in synthesizing fluid and intricate dynamic details. Beyond body pose control, we connect a local control module with our model to capture identity-disentangled facial expressions, facilitating accurate expression transfer for enhanced realism in animated scenes. Together, these components form a unified framework capable of learning physical human motion and natural scene dynamics from a diverse blend of human and scene videos. Comprehensive qualitative and quantitative evaluations demonstrate that X-Dyna outperforms state-of-the-art methods, creating highly lifelike and expressive animations. The code is available at https://github.com/bytedance/X-Dyna.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2501.10021', 'place': '', 'date': '2025-01-20', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2501.10021', 'citationKey': '', 'url': 'http://arxiv.org/abs/2501.10021', 'accessDate': '2025-07-22T03:24:18Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'X-Dyna', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2501.10021 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:24:18Z', 'dateModified': '2025-07-22T03:24:18Z'}\n",
      "{'key': 'RYY2GXFC', 'version': 5171, 'itemType': 'preprint', 'title': 'Cinemo: Consistent and Controllable Image Animation with Motion Diffusion Models', 'creators': [{'creatorType': 'author', 'firstName': 'Xin', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Yaohui', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Gengyun', 'lastName': 'Jia'}, {'creatorType': 'author', 'firstName': 'Xinyuan', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Yuan-Fang', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Cunjian', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Yu', 'lastName': 'Qiao'}], 'abstractNote': 'Diffusion models have achieved great progress in image animation due to powerful generative capabilities. However, maintaining spatio-temporal consistency with detailed information from the input static image over time (e.g., style, background, and object of the input static image) and ensuring smoothness in animated video narratives guided by textual prompts still remains challenging. In this paper, we introduce Cinemo, a novel image animation approach towards achieving better motion controllability, as well as stronger temporal consistency and smoothness. In general, we propose three effective strategies at the training and inference stages of Cinemo to accomplish our goal. At the training stage, Cinemo focuses on learning the distribution of motion residuals, rather than directly predicting subsequent via a motion diffusion model. Additionally, a structural similarity index-based strategy is proposed to enable Cinemo to have better controllability of motion intensity. At the inference stage, a noise refinement technique based on discrete cosine transformation is introduced to mitigate sudden motion changes. Such three strategies enable Cinemo to produce highly consistent, smooth, and motion-controllable results. Compared to previous methods, Cinemo offers simpler and more precise user controllability. Extensive experiments against several state-of-the-art methods, including both commercial tools and research approaches, across multiple metrics, demonstrate the effectiveness and superiority of our proposed approach.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2407.15642', 'place': '', 'date': '2024-07-23', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2407.15642', 'citationKey': '', 'url': 'http://arxiv.org/abs/2407.15642', 'accessDate': '2025-07-22T03:24:03Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Cinemo', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2407.15642 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:24:03Z', 'dateModified': '2025-07-22T03:24:03Z'}\n",
      "{'key': 'I7JJBQS4', 'version': 5167, 'itemType': 'preprint', 'title': 'Identity-Preserving Text-to-Video Generation by Frequency Decomposition', 'creators': [{'creatorType': 'author', 'firstName': 'Shenghai', 'lastName': 'Yuan'}, {'creatorType': 'author', 'firstName': 'Jinfa', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Xianyi', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Yunyuan', 'lastName': 'Ge'}, {'creatorType': 'author', 'firstName': 'Yujun', 'lastName': 'Shi'}, {'creatorType': 'author', 'firstName': 'Liuhan', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Jiebo', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Li', 'lastName': 'Yuan'}], 'abstractNote': \"Identity-preserving text-to-video (IPT2V) generation aims to create high-fidelity videos with consistent human identity. It is an important task in video generation but remains an open problem for generative models. This paper pushes the technical frontier of IPT2V in two directions that have not been resolved in literature: (1) A tuning-free pipeline without tedious case-by-case finetuning, and (2) A frequency-aware heuristic identity-preserving DiT-based control scheme. We propose ConsisID, a tuning-free DiT-based controllable IPT2V model to keep human identity consistent in the generated video. Inspired by prior findings in frequency analysis of diffusion transformers, it employs identity-control signals in the frequency domain, where facial features can be decomposed into low-frequency global features and high-frequency intrinsic features. First, from a low-frequency perspective, we introduce a global facial extractor, which encodes reference images and facial key points into a latent space, generating features enriched with low-frequency information. These features are then integrated into shallow layers of the network to alleviate training challenges associated with DiT. Second, from a high-frequency perspective, we design a local facial extractor to capture high-frequency details and inject them into transformer blocks, enhancing the model's ability to preserve fine-grained features. We propose a hierarchical training strategy to leverage frequency information for identity preservation, transforming a vanilla pre-trained video generation model into an IPT2V model. Extensive experiments demonstrate that our frequency-aware heuristic scheme provides an optimal control solution for DiT-based models. Thanks to this scheme, our ConsisID generates high-quality, identity-preserving videos, making strides towards more effective IPT2V. Code: https://github.com/PKU-YuanGroup/ConsisID.\", 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2411.17440', 'place': '', 'date': '2025-03-25', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2411.17440', 'citationKey': '', 'url': 'http://arxiv.org/abs/2411.17440', 'accessDate': '2025-07-22T03:23:11Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2411.17440 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Multimedia', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:23:11Z', 'dateModified': '2025-07-22T03:23:11Z'}\n",
      "{'key': 'K9B8PPIR', 'version': 5163, 'itemType': 'preprint', 'title': 'Generative Photography: Scene-Consistent Camera Control for Realistic Text-to-Image Synthesis', 'creators': [{'creatorType': 'author', 'firstName': 'Yu', 'lastName': 'Yuan'}, {'creatorType': 'author', 'firstName': 'Xijun', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yichen', 'lastName': 'Sheng'}, {'creatorType': 'author', 'firstName': 'Prateek', 'lastName': 'Chennuri'}, {'creatorType': 'author', 'firstName': 'Xingguang', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Stanley', 'lastName': 'Chan'}], 'abstractNote': 'Image generation today can produce somewhat realistic images from text prompts. However, if one asks the generator to synthesize a specific camera setting such as creating different fields of view using a 24mm lens versus a 70mm lens, the generator will not be able to interpret and generate scene-consistent images. This limitation not only hinders the adoption of generative tools in professional photography but also highlights the broader challenge of aligning data-driven models with real-world physical settings. In this paper, we introduce Generative Photography, a framework that allows controlling camera intrinsic settings during content generation. The core innovation of this work are the concepts of Dimensionality Lifting and Differential Camera Intrinsics Learning, enabling smooth and consistent transitions across different camera settings. Experimental results show that our method produces significantly more scene-consistent photorealistic images than state-of-the-art models such as Stable Diffusion 3 and FLUX. Our code and additional results are available at https://generative-photography.github.io/project.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2412.02168', 'place': '', 'date': '2025-03-25', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2412.02168', 'citationKey': '', 'url': 'http://arxiv.org/abs/2412.02168', 'accessDate': '2025-07-22T03:22:18Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Generative Photography', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2412.02168 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:22:18Z', 'dateModified': '2025-07-22T03:22:18Z'}\n",
      "{'key': 'ETV3Y9AE', 'version': 5159, 'itemType': 'preprint', 'title': 'Parallelized Autoregressive Visual Generation', 'creators': [{'creatorType': 'author', 'firstName': 'Yuqing', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Shuhuai', 'lastName': 'Ren'}, {'creatorType': 'author', 'firstName': 'Zhijie', 'lastName': 'Lin'}, {'creatorType': 'author', 'firstName': 'Yujin', 'lastName': 'Han'}, {'creatorType': 'author', 'firstName': 'Haoyuan', 'lastName': 'Guo'}, {'creatorType': 'author', 'firstName': 'Zhenheng', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Difan', 'lastName': 'Zou'}, {'creatorType': 'author', 'firstName': 'Jiashi', 'lastName': 'Feng'}, {'creatorType': 'author', 'firstName': 'Xihui', 'lastName': 'Liu'}], 'abstractNote': 'Autoregressive models have emerged as a powerful approach for visual generation but suffer from slow inference speed due to their sequential token-by-token prediction process. In this paper, we propose a simple yet effective approach for parallelized autoregressive visual generation that improves generation efficiency while preserving the advantages of autoregressive modeling. Our key insight is that parallel generation depends on visual token dependencies-tokens with weak dependencies can be generated in parallel, while strongly dependent adjacent tokens are difficult to generate together, as their independent sampling may lead to inconsistencies. Based on this observation, we develop a parallel generation strategy that generates distant tokens with weak dependencies in parallel while maintaining sequential generation for strongly dependent local tokens. Our approach can be seamlessly integrated into standard autoregressive models without modifying the architecture or tokenizer. Experiments on ImageNet and UCF-101 demonstrate that our method achieves a 3.6x speedup with comparable quality and up to 9.5x speedup with minimal quality degradation across both image and video generation tasks. We hope this work will inspire future research in efficient visual generation and unified autoregressive modeling. Project page: https://yuqingwang1029.github.io/PAR-project.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2412.15119', 'place': '', 'date': '2025-04-03', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2412.15119', 'citationKey': '', 'url': 'http://arxiv.org/abs/2412.15119', 'accessDate': '2025-07-22T03:19:01Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2412.15119 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:19:01Z', 'dateModified': '2025-07-22T03:19:01Z'}\n",
      "{'key': '4YR5RB63', 'version': 5155, 'itemType': 'preprint', 'title': 'TokenFlow: Unified Image Tokenizer for Multimodal Understanding and Generation', 'creators': [{'creatorType': 'author', 'firstName': 'Liao', 'lastName': 'Qu'}, {'creatorType': 'author', 'firstName': 'Huichao', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Yiheng', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Xu', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Jiang'}, {'creatorType': 'author', 'firstName': 'Yiming', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Hu', 'lastName': 'Ye'}, {'creatorType': 'author', 'firstName': 'Daniel K.', 'lastName': 'Du'}, {'creatorType': 'author', 'firstName': 'Zehuan', 'lastName': 'Yuan'}, {'creatorType': 'author', 'firstName': 'Xinglong', 'lastName': 'Wu'}], 'abstractNote': \"We present TokenFlow, a novel unified image tokenizer that bridges the long-standing gap between multimodal understanding and generation. Prior research attempt to employ a single reconstruction-targeted Vector Quantization (VQ) encoder for unifying these two tasks. We observe that understanding and generation require fundamentally different granularities of visual information. This leads to a critical trade-off, particularly compromising performance in multimodal understanding tasks. TokenFlow addresses this challenge through an innovative dual-codebook architecture that decouples semantic and pixel-level feature learning while maintaining their alignment via a shared mapping mechanism. This design enables direct access to both high-level semantic representations crucial for understanding tasks and fine-grained visual features essential for generation through shared indices. Our extensive experiments demonstrate TokenFlow's superiority across multiple dimensions. Leveraging TokenFlow, we demonstrate for the first time that discrete visual input can surpass LLaVA-1.5 13B in understanding performance, achieving a 7.2\\\\% average improvement. For image reconstruction, we achieve a strong FID score of 0.63 at 384*384 resolution. Moreover, TokenFlow establishes state-of-the-art performance in autoregressive image generation with a GenEval score of 0.55 at 256*256 resolution, achieving comparable results to SDXL.\", 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2412.03069', 'place': '', 'date': '2024-12-04', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2412.03069', 'citationKey': '', 'url': 'http://arxiv.org/abs/2412.03069', 'accessDate': '2025-07-22T03:17:35Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'TokenFlow', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2412.03069 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:17:35Z', 'dateModified': '2025-07-22T03:17:35Z'}\n",
      "{'key': 'U6ITKL6L', 'version': 5152, 'itemType': 'preprint', 'title': 'SleeperMark: Towards Robust Watermark against Fine-Tuning Text-to-image Diffusion Models', 'creators': [{'creatorType': 'author', 'firstName': 'Zilan', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Junfeng', 'lastName': 'Guo'}, {'creatorType': 'author', 'firstName': 'Jiacheng', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Yiming', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Heng', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Muhao', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Zhengzhong', 'lastName': 'Tu'}], 'abstractNote': \"Recent advances in large-scale text-to-image (T2I) diffusion models have enabled a variety of downstream applications, including style customization, subject-driven personalization, and conditional generation. As T2I models require extensive data and computational resources for training, they constitute highly valued intellectual property (IP) for their legitimate owners, yet making them incentive targets for unauthorized fine-tuning by adversaries seeking to leverage these models for customized, usually profitable applications. Existing IP protection methods for diffusion models generally involve embedding watermark patterns and then verifying ownership through generated outputs examination, or inspecting the model's feature space. However, these techniques are inherently ineffective in practical scenarios when the watermarked model undergoes fine-tuning, and the feature space is inaccessible during verification ((i.e., black-box setting). The model is prone to forgetting the previously learned watermark knowledge when it adapts to a new task. To address this challenge, we propose SleeperMark, a novel framework designed to embed resilient watermarks into T2I diffusion models. SleeperMark explicitly guides the model to disentangle the watermark information from the semantic concepts it learns, allowing the model to retain the embedded watermark while continuing to be adapted to new downstream tasks. Our extensive experiments demonstrate the effectiveness of SleeperMark across various types of diffusion models, including latent diffusion models (e.g., Stable Diffusion) and pixel diffusion models (e.g., DeepFloyd-IF), showing robustness against downstream fine-tuning and various attacks at both the image and model levels, with minimal impact on the model's generative capability. The code is available at https://github.com/taco-group/SleeperMark.\", 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2412.04852', 'place': '', 'date': '2025-03-30', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2412.04852', 'citationKey': '', 'url': 'http://arxiv.org/abs/2412.04852', 'accessDate': '2025-07-22T03:16:47Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'SleeperMark', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2412.04852 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:16:47Z', 'dateModified': '2025-07-22T03:16:47Z'}\n",
      "{'key': '9TI9SUCU', 'version': 5149, 'itemType': 'preprint', 'title': 'Reconstruction vs. Generation: Taming Optimization Dilemma in Latent Diffusion Models', 'creators': [{'creatorType': 'author', 'firstName': 'Jingfeng', 'lastName': 'Yao'}, {'creatorType': 'author', 'firstName': 'Bin', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Xinggang', 'lastName': 'Wang'}], 'abstractNote': 'Latent diffusion models with Transformer architectures excel at generating high-fidelity images. However, recent studies reveal an optimization dilemma in this two-stage design: while increasing the per-token feature dimension in visual tokenizers improves reconstruction quality, it requires substantially larger diffusion models and more training iterations to achieve comparable generation performance. Consequently, existing systems often settle for sub-optimal solutions, either producing visual artifacts due to information loss within tokenizers or failing to converge fully due to expensive computation costs. We argue that this dilemma stems from the inherent difficulty in learning unconstrained high-dimensional latent spaces. To address this, we propose aligning the latent space with pre-trained vision foundation models when training the visual tokenizers. Our proposed VA-VAE (Vision foundation model Aligned Variational AutoEncoder) significantly expands the reconstruction-generation frontier of latent diffusion models, enabling faster convergence of Diffusion Transformers (DiT) in high-dimensional latent spaces. To exploit the full potential of VA-VAE, we build an enhanced DiT baseline with improved training strategies and architecture designs, termed LightningDiT. The integrated system achieves state-of-the-art (SOTA) performance on ImageNet 256x256 generation with an FID score of 1.35 while demonstrating remarkable training efficiency by reaching an FID score of 2.11 in just 64 epochs--representing an over 21 times convergence speedup compared to the original DiT. Models and codes are available at: https://github.com/hustvl/LightningDiT.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2501.01423', 'place': '', 'date': '2025-03-10', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2501.01423', 'citationKey': '', 'url': 'http://arxiv.org/abs/2501.01423', 'accessDate': '2025-07-22T03:16:15Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Reconstruction vs. Generation', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2501.01423 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:16:15Z', 'dateModified': '2025-07-22T03:16:15Z'}\n",
      "{'key': 'MHNW3I2X', 'version': 5146, 'itemType': 'preprint', 'title': 'Auto-Encoded Supervision for Perceptual Image Super-Resolution', 'creators': [{'creatorType': 'author', 'firstName': 'MinKyu', 'lastName': 'Lee'}, {'creatorType': 'author', 'firstName': 'Sangeek', 'lastName': 'Hyun'}, {'creatorType': 'author', 'firstName': 'Woojin', 'lastName': 'Jun'}, {'creatorType': 'author', 'firstName': 'Jae-Pil', 'lastName': 'Heo'}], 'abstractNote': 'This work tackles the fidelity objective in the perceptual super-resolution~(SR). Specifically, we address the shortcomings of pixel-level $L_\\\\text{p}$ loss ($\\\\mathcal{L}_\\\\text{pix}$) in the GAN-based SR framework. Since $L_\\\\text{pix}$ is known to have a trade-off relationship against perceptual quality, prior methods often multiply a small scale factor or utilize low-pass filters. However, this work shows that these circumventions fail to address the fundamental factor that induces blurring. Accordingly, we focus on two points: 1) precisely discriminating the subcomponent of $L_\\\\text{pix}$ that contributes to blurring, and 2) only guiding based on the factor that is free from this trade-off relationship. We show that they can be achieved in a surprisingly simple manner, with an Auto-Encoder (AE) pretrained with $L_\\\\text{pix}$. Accordingly, we propose the Auto-Encoded Supervision for Optimal Penalization loss ($L_\\\\text{AESOP}$), a novel loss function that measures distance in the AE space, instead of the raw pixel space. Note that the AE space indicates the space after the decoder, not the bottleneck. By simply substituting $L_\\\\text{pix}$ with $L_\\\\text{AESOP}$, we can provide effective reconstruction guidance without compromising perceptual quality. Designed for simplicity, our method enables easy integration into existing SR frameworks. Experimental results verify that AESOP can lead to favorable results in the perceptual SR task.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2412.00124', 'place': '', 'date': '2025-04-11', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2412.00124', 'citationKey': '', 'url': 'http://arxiv.org/abs/2412.00124', 'accessDate': '2025-07-22T03:15:37Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2412.00124 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Electrical Engineering and Systems Science - Image and Video Processing', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:15:37Z', 'dateModified': '2025-07-22T03:15:37Z'}\n",
      "{'key': '6SJUKZ48', 'version': 5142, 'itemType': 'preprint', 'title': 'LiMoE: Mixture of LiDAR Representation Learners from Automotive Scenes', 'creators': [{'creatorType': 'author', 'firstName': 'Xiang', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Lingdong', 'lastName': 'Kong'}, {'creatorType': 'author', 'firstName': 'Hui', 'lastName': 'Shuai'}, {'creatorType': 'author', 'firstName': 'Liang', 'lastName': 'Pan'}, {'creatorType': 'author', 'firstName': 'Ziwei', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Qingshan', 'lastName': 'Liu'}], 'abstractNote': 'LiDAR data pretraining offers a promising approach to leveraging large-scale, readily available datasets for enhanced data utilization. However, existing methods predominantly focus on sparse voxel representation, overlooking the complementary attributes provided by other LiDAR representations. In this work, we propose LiMoE, a framework that integrates the Mixture of Experts (MoE) paradigm into LiDAR data representation learning to synergistically combine multiple representations, such as range images, sparse voxels, and raw points. Our approach consists of three stages: i) Image-to-LiDAR Pretraining, which transfers prior knowledge from images to point clouds across different representations; ii) Contrastive Mixture Learning (CML), which uses MoE to adaptively activate relevant attributes from each representation and distills these mixed features into a unified 3D network; iii) Semantic Mixture Supervision (SMS), which combines semantic logits from multiple representations to boost downstream segmentation performance. Extensive experiments across eleven large-scale LiDAR datasets demonstrate our effectiveness and superiority. The code has been made publicly accessible.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2501.04004', 'place': '', 'date': '2025-03-20', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2501.04004', 'citationKey': '', 'url': 'http://arxiv.org/abs/2501.04004', 'accessDate': '2025-07-22T03:10:46Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'LiMoE', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2501.04004 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}, {'tag': 'Computer Science - Robotics', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:10:46Z', 'dateModified': '2025-07-22T03:10:46Z'}\n",
      "{'key': 'BLTX79JQ', 'version': 5138, 'itemType': 'preprint', 'title': 'Omnidirectional Multi-Object Tracking', 'creators': [{'creatorType': 'author', 'firstName': 'Kai', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Hao', 'lastName': 'Shi'}, {'creatorType': 'author', 'firstName': 'Sheng', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Fei', 'lastName': 'Teng'}, {'creatorType': 'author', 'firstName': 'Mengfei', 'lastName': 'Duan'}, {'creatorType': 'author', 'firstName': 'Chang', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Yuhang', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Kaiwei', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Kailun', 'lastName': 'Yang'}], 'abstractNote': 'Panoramic imagery, with its 360{\\\\deg} field of view, offers comprehensive information to support Multi-Object Tracking (MOT) in capturing spatial and temporal relationships of surrounding objects. However, most MOT algorithms are tailored for pinhole images with limited views, impairing their effectiveness in panoramic settings. Additionally, panoramic image distortions, such as resolution loss, geometric deformation, and uneven lighting, hinder direct adaptation of existing MOT methods, leading to significant performance degradation. To address these challenges, we propose OmniTrack, an omnidirectional MOT framework that incorporates Tracklet Management to introduce temporal cues, FlexiTrack Instances for object localization and association, and the CircularStatE Module to alleviate image and geometric distortions. This integration enables tracking in panoramic field-of-view scenarios, even under rapid sensor motion. To mitigate the lack of panoramic MOT datasets, we introduce the QuadTrack dataset--a comprehensive panoramic dataset collected by a quadruped robot, featuring diverse challenges such as panoramic fields of view, intense motion, and complex environments. Extensive experiments on the public JRDB dataset and the newly introduced QuadTrack benchmark demonstrate the state-of-the-art performance of the proposed framework. OmniTrack achieves a HOTA score of 26.92% on JRDB, representing an improvement of 3.43%, and further achieves 23.45% on QuadTrack, surpassing the baseline by 6.81%. The established dataset and source code are available at https://github.com/xifen523/OmniTrack.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2503.04565', 'place': '', 'date': '2025-03-23', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2503.04565', 'citationKey': '', 'url': 'http://arxiv.org/abs/2503.04565', 'accessDate': '2025-07-22T03:09:37Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2503.04565 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Robotics', 'type': 1}, {'tag': 'Electrical Engineering and Systems Science - Image and Video Processing', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:09:37Z', 'dateModified': '2025-07-22T03:09:37Z'}\n",
      "{'key': 'ESCX4DDF', 'version': 5134, 'itemType': 'preprint', 'title': 'Multiple Object Tracking as ID Prediction', 'creators': [{'creatorType': 'author', 'firstName': 'Ruopeng', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Ji', 'lastName': 'Qi'}, {'creatorType': 'author', 'firstName': 'Limin', 'lastName': 'Wang'}], 'abstractNote': \"Multi-Object Tracking (MOT) has been a long-standing challenge in video understanding. A natural and intuitive approach is to split this task into two parts: object detection and association. Most mainstream methods employ meticulously crafted heuristic techniques to maintain trajectory information and compute cost matrices for object matching. Although these methods can achieve notable tracking performance, they often require a series of elaborate handcrafted modifications while facing complicated scenarios. We believe that manually assumed priors limit the method's adaptability and flexibility in learning optimal tracking capabilities from domain-specific data. Therefore, we introduce a new perspective that treats Multiple Object Tracking as an in-context ID Prediction task, transforming the aforementioned object association into an end-to-end trainable task. Based on this, we propose a simple yet effective method termed MOTIP. Given a set of trajectories carried with ID information, MOTIP directly decodes the ID labels for current detections to accomplish the association process. Without using tailored or sophisticated architectures, our method achieves state-of-the-art results across multiple benchmarks by solely leveraging object-level features as tracking cues. The simplicity and impressive results of MOTIP leave substantial room for future advancements, thereby making it a promising baseline for subsequent research. Our code and checkpoints are released at https://github.com/MCG-NJU/MOTIP.\", 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2403.16848', 'place': '', 'date': '2025-03-24', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2403.16848', 'citationKey': '', 'url': 'http://arxiv.org/abs/2403.16848', 'accessDate': '2025-07-22T03:08:59Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2403.16848 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:08:59Z', 'dateModified': '2025-07-22T03:08:59Z'}\n",
      "{'key': '9HBBGZL2', 'version': 5127, 'itemType': 'preprint', 'title': 'LLMDet: Learning Strong Open-Vocabulary Object Detectors under the Supervision of Large Language Models', 'creators': [{'creatorType': 'author', 'firstName': 'Shenghao', 'lastName': 'Fu'}, {'creatorType': 'author', 'firstName': 'Qize', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Qijie', 'lastName': 'Mo'}, {'creatorType': 'author', 'firstName': 'Junkai', 'lastName': 'Yan'}, {'creatorType': 'author', 'firstName': 'Xihan', 'lastName': 'Wei'}, {'creatorType': 'author', 'firstName': 'Jingke', 'lastName': 'Meng'}, {'creatorType': 'author', 'firstName': 'Xiaohua', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Wei-Shi', 'lastName': 'Zheng'}], 'abstractNote': 'Recent open-vocabulary detectors achieve promising performance with abundant region-level annotated data. In this work, we show that an open-vocabulary detector co-training with a large language model by generating image-level detailed captions for each image can further improve performance. To achieve the goal, we first collect a dataset, GroundingCap-1M, wherein each image is accompanied by associated grounding labels and an image-level detailed caption. With this dataset, we finetune an open-vocabulary detector with training objectives including a standard grounding loss and a caption generation loss. We take advantage of a large language model to generate both region-level short captions for each region of interest and image-level long captions for the whole image. Under the supervision of the large language model, the resulting detector, LLMDet, outperforms the baseline by a clear margin, enjoying superior open-vocabulary ability. Further, we show that the improved LLMDet can in turn build a stronger large multi-modal model, achieving mutual benefits. The code, model, and dataset is available at https://github.com/iSEE-Laboratory/LLMDet.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2501.18954', 'place': '', 'date': '2025-01-31', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2501.18954', 'citationKey': '', 'url': 'http://arxiv.org/abs/2501.18954', 'accessDate': '2025-07-22T03:04:09Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'LLMDet', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2501.18954 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:04:09Z', 'dateModified': '2025-07-22T03:04:09Z'}\n",
      "{'key': 'HGJVPQLD', 'version': 5122, 'itemType': 'preprint', 'title': 'NLPrompt: Noise-Label Prompt Learning for Vision-Language Models', 'creators': [{'creatorType': 'author', 'firstName': 'Bikang', 'lastName': 'Pan'}, {'creatorType': 'author', 'firstName': 'Qun', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Xiaoying', 'lastName': 'Tang'}, {'creatorType': 'author', 'firstName': 'Wei', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Zhen', 'lastName': 'Fang'}, {'creatorType': 'author', 'firstName': 'Feng', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Jingya', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Jingyi', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Ye', 'lastName': 'Shi'}], 'abstractNote': 'The emergence of vision-language foundation models, such as CLIP, has revolutionized image-text representation, enabling a broad range of applications via prompt learning. Despite its promise, real-world datasets often contain noisy labels that can degrade prompt learning performance. In this paper, we demonstrate that using mean absolute error (MAE) loss in prompt learning, named PromptMAE, significantly enhances robustness against noisy labels while maintaining high accuracy. Though MAE is straightforward and recognized for its robustness, it is rarely used in noisy-label learning due to its slow convergence and poor performance outside prompt learning scenarios. To elucidate the robustness of PromptMAE, we leverage feature learning theory to show that MAE can suppress the influence of noisy samples, thereby improving the signal-to-noise ratio and enhancing overall robustness. Additionally, we introduce PromptOT, a prompt-based optimal transport data purification method to enhance the robustness further. PromptOT employs text features in vision-language models as prototypes to construct an optimal transportation matrix. This matrix effectively partitions datasets into clean and noisy subsets, allowing for the application of cross-entropy loss to the clean subset and MAE loss to the noisy subset. Our Noise-Label Prompt Learning method, named NLPrompt, offers a simple and efficient approach that leverages the expressive representations and precise alignment capabilities of vision-language models for robust prompt learning. We validate NLPrompt through extensive experiments across various noise settings, demonstrating significant performance improvements.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2412.01256', 'place': '', 'date': '2025-03-26', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2412.01256', 'citationKey': '', 'url': 'http://arxiv.org/abs/2412.01256', 'accessDate': '2025-07-22T03:03:06Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'NLPrompt', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2412.01256 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T03:03:06Z', 'dateModified': '2025-07-22T03:03:06Z'}\n",
      "{'key': 'A5VFLGTF', 'version': 5118, 'itemType': 'preprint', 'title': 'RAP: Retrieval-Augmented Personalization for Multimodal Large Language Models', 'creators': [{'creatorType': 'author', 'firstName': 'Haoran', 'lastName': 'Hao'}, {'creatorType': 'author', 'firstName': 'Jiaming', 'lastName': 'Han'}, {'creatorType': 'author', 'firstName': 'Changsheng', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Yu-Feng', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Xiangyu', 'lastName': 'Yue'}], 'abstractNote': \"The development of large language models (LLMs) has significantly enhanced the capabilities of multimodal LLMs (MLLMs) as general assistants. However, lack of user-specific knowledge still restricts their application in human's daily life. In this paper, we introduce the Retrieval Augmented Personalization (RAP) framework for MLLMs' personalization. Starting from a general MLLM, we turn it into a personalized assistant in three steps. (a) Remember: We design a key-value database to store user-related information, e.g., user's name, avatar and other attributes. (b) Retrieve: When the user initiates a conversation, RAP will retrieve relevant information from the database using a multimodal retriever. (c) Generate: The input query and retrieved concepts' information are fed into MLLMs to generate personalized, knowledge-augmented responses. Unlike previous methods, RAP allows real-time concept editing via updating the external database. To further improve generation quality and alignment with user-specific information, we design a pipeline for data collection and create a specialized dataset for personalized training of MLLMs. Based on the dataset, we train a series of MLLMs as personalized multimodal assistants. By pretraining on large-scale dataset, RAP-MLLMs can generalize to infinite visual concepts without additional finetuning. Our models demonstrate outstanding flexibility and generation quality across a variety of tasks, such as personalized image captioning, question answering and visual recognition. The code, data and models are available at https://hoar012.github.io/RAP-Project/.\", 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2410.13360', 'place': '', 'date': '2025-03-28', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2410.13360', 'citationKey': '', 'url': 'http://arxiv.org/abs/2410.13360', 'accessDate': '2025-07-22T02:53:01Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'RAP', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2410.13360 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computation and Language', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}, {'tag': 'Computer Science - Multimedia', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T02:53:01Z', 'dateModified': '2025-07-22T02:53:01Z'}\n",
      "{'key': 'S3WLFZEK', 'version': 5113, 'itemType': 'preprint', 'title': 'DynRefer: Delving into Region-level Multimodal Tasks via Dynamic Resolution', 'creators': [{'creatorType': 'author', 'firstName': 'Yuzhong', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Feng', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Yue', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Mingxiang', 'lastName': 'Liao'}, {'creatorType': 'author', 'firstName': 'Chen', 'lastName': 'Gong'}, {'creatorType': 'author', 'firstName': 'Qixiang', 'lastName': 'Ye'}, {'creatorType': 'author', 'firstName': 'Fang', 'lastName': 'Wan'}], 'abstractNote': 'One fundamental task of multimodal models is to translate referred image regions to human preferred language descriptions. Existing methods, however, ignore the resolution adaptability needs of different tasks, which hinders them to find out precise language descriptions. In this study, we propose a DynRefer approach, to pursue high-accuracy region-level referring through mimicking the resolution adaptability of human visual cognition. During training, DynRefer stochastically aligns language descriptions of multimodal tasks with images of multiple resolutions, which are constructed by nesting a set of random views around the referred region. During inference, DynRefer performs selectively multimodal referring by sampling proper region representations for tasks from the nested views based on image and task priors. This allows the visual information for referring to better match human preferences, thereby improving the representational adaptability of region-level multimodal models. Experiments show that DynRefer brings mutual improvement upon broad tasks including region-level captioning, open-vocabulary region recognition and attribute detection. Furthermore, DynRefer achieves state-of-the-art results on multiple region-level multimodal tasks using a single model. Code is available at https://github.com/callsys/DynRefer.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2405.16071', 'place': '', 'date': '2025-03-02', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2405.16071', 'citationKey': '', 'url': 'http://arxiv.org/abs/2405.16071', 'accessDate': '2025-07-22T02:22:54Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'DynRefer', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2405.16071 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T02:22:55Z', 'dateModified': '2025-07-22T02:22:55Z'}\n",
      "{'key': 'NU9SG3BT', 'version': 5103, 'itemType': 'preprint', 'title': 'LSceneLLM: Enhancing Large 3D Scene Understanding Using Adaptive Visual Preferences', 'creators': [{'creatorType': 'author', 'firstName': 'Hongyan', 'lastName': 'Zhi'}, {'creatorType': 'author', 'firstName': 'Peihao', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Junyan', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Shuailei', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Xinyu', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'Tianhang', 'lastName': 'Xiang'}, {'creatorType': 'author', 'firstName': 'Yinjie', 'lastName': 'Lei'}, {'creatorType': 'author', 'firstName': 'Mingkui', 'lastName': 'Tan'}, {'creatorType': 'author', 'firstName': 'Chuang', 'lastName': 'Gan'}], 'abstractNote': \"Research on 3D Vision-Language Models (3D-VLMs) is gaining increasing attention, which is crucial for developing embodied AI within 3D scenes, such as visual navigation and embodied question answering. Due to the high density of visual features, especially in large 3D scenes, accurately locating task-relevant visual information is challenging. Existing works attempt to segment all objects and consider their features as scene representations. However, these task-agnostic object features include much redundant information and missing details for the task-relevant area. To tackle these problems, we propose LSceneLLM, an adaptive framework that automatically identifies task-relevant areas by leveraging LLM's visual preference for different tasks, followed by a plug-and-play scene magnifier module to capture fine-grained details in focused areas. Specifically, a dense token selector examines the attention map of LLM to identify visual preferences for the instruction input. It then magnifies fine-grained details of the focusing area. An adaptive self-attention module is leveraged to fuse the coarse-grained and selected fine-grained visual information. To comprehensively evaluate the large scene understanding ability of 3D-VLMs, we further introduce a cross-room understanding benchmark, XR-Scene, which contains a series of large scene understanding tasks including XR-QA, XR-EmbodiedPlanning, and XR-SceneCaption. Experiments show that our method surpasses existing methods on both large scene understanding and existing scene understanding benchmarks. Plunging our scene magnifier module into the existing 3D-VLMs also brings significant improvement.\", 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2412.01292', 'place': '', 'date': '2025-02-02', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2412.01292', 'citationKey': '', 'url': 'http://arxiv.org/abs/2412.01292', 'accessDate': '2025-07-22T01:48:28Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'LSceneLLM', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2412.01292 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T01:48:28Z', 'dateModified': '2025-07-22T01:48:28Z'}\n",
      "{'key': 'ECSNBIZD', 'version': 5099, 'itemType': 'preprint', 'title': 'CityWalker: Learning Embodied Urban Navigation from Web-Scale Videos', 'creators': [{'creatorType': 'author', 'firstName': 'Xinhao', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Jintong', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Yicheng', 'lastName': 'Jiang'}, {'creatorType': 'author', 'firstName': 'Niranjan', 'lastName': 'Sujay'}, {'creatorType': 'author', 'firstName': 'Zhicheng', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Juexiao', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'John', 'lastName': 'Abanes'}, {'creatorType': 'author', 'firstName': 'Jing', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Chen', 'lastName': 'Feng'}], 'abstractNote': 'Navigating dynamic urban environments presents significant challenges for embodied agents, requiring advanced spatial reasoning and adherence to common-sense norms. Despite progress, existing visual navigation methods struggle in map-free or off-street settings, limiting the deployment of autonomous agents like last-mile delivery robots. To overcome these obstacles, we propose a scalable, data-driven approach for human-like urban navigation by training agents on thousands of hours of in-the-wild city walking and driving videos sourced from the web. We introduce a simple and scalable data processing pipeline that extracts action supervision from these videos, enabling large-scale imitation learning without costly annotations. Our model learns sophisticated navigation policies to handle diverse challenges and critical scenarios. Experimental results show that training on large-scale, diverse datasets significantly enhances navigation performance, surpassing current methods. This work shows the potential of using abundant online video data to develop robust navigation policies for embodied agents in dynamic urban settings. Project homepage is at https://ai4ce.github.io/CityWalker/.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2411.17820', 'place': '', 'date': '2025-04-22', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2411.17820', 'citationKey': '', 'url': 'http://arxiv.org/abs/2411.17820', 'accessDate': '2025-07-22T01:46:11Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'CityWalker', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2411.17820 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Robotics', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T01:46:11Z', 'dateModified': '2025-07-22T01:46:11Z'}\n",
      "{'key': '4VBHP4C5', 'version': 5095, 'itemType': 'preprint', 'title': 'MambaVision: A Hybrid Mamba-Transformer Vision Backbone', 'creators': [{'creatorType': 'author', 'firstName': 'Ali', 'lastName': 'Hatamizadeh'}, {'creatorType': 'author', 'firstName': 'Jan', 'lastName': 'Kautz'}], 'abstractNote': 'We propose a novel hybrid Mamba-Transformer backbone, MambaVision, specifically tailored for vision applications. Our core contribution includes redesigning the Mamba formulation to enhance its capability for efficient modeling of visual features. Through a comprehensive ablation study, we demonstrate the feasibility of integrating Vision Transformers (ViT) with Mamba. Our results show that equipping the Mamba architecture with self-attention blocks in the final layers greatly improves its capacity to capture long-range spatial dependencies. Based on these findings, we introduce a family of MambaVision models with a hierarchical architecture to meet various design criteria. For classification on the ImageNet-1K dataset, MambaVision variants achieve state-of-the-art (SOTA) performance in terms of both Top-1 accuracy and throughput. In downstream tasks such as object detection, instance segmentation, and semantic segmentation on MS COCO and ADE20K datasets, MambaVision outperforms comparably sized backbones while demonstrating favorable performance. Code: https://github.com/NVlabs/MambaVision', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2407.08083', 'place': '', 'date': '2025-03-25', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2407.08083', 'citationKey': '', 'url': 'http://arxiv.org/abs/2407.08083', 'accessDate': '2025-07-22T01:43:41Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'MambaVision', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2407.08083 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-07-22T01:43:41Z', 'dateModified': '2025-07-22T01:43:41Z'}\n",
      "{'key': 'XTKHQYXQ', 'version': 5000, 'itemType': 'journalArticle', 'title': 'A novel framework on intelligent detection for module defects of PV plant combining the visible and infrared images一种结合可见光和红外图像的光伏电站组件缺陷智能检测新框架', 'creators': [{'creatorType': 'author', 'firstName': 'Feng', 'lastName': 'Hong'}, {'creatorType': 'author', 'firstName': 'Jie', 'lastName': 'Song'}, {'creatorType': 'author', 'firstName': 'Hang', 'lastName': 'Meng'}, {'creatorType': 'author', 'firstName': 'Rui', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Fang', 'lastName': 'Fang'}, {'creatorType': 'author', 'firstName': 'Guangming', 'lastName': 'Zhang'}], 'abstractNote': 'Solar Photovoltaic (PV) industry has achieved rapid development in recent years. However, it is difficult and costly to detect the micro fault area in a large PV power plant due to environmental factors and missing data. Most faults can be detected by the infrared temperature measurement method, but the infrared camera characteristics constrain it. This paper proposed a novel framework, consisting of image acquirement, image segmentation, fault orientation and defect warning, to remedy the limitations for PV module defects. The visible and infrared PV array images are taken under the same conditions by a dual infrared camera at low altitudes. The deep learning methods, including the fifth version of You Only Look Once (YOLOv5) algorithm and Deep Residual Network (ResNet) algorithm, are introduced to this framework. Hence, this framework has strong capability to suit almost all brightness conditions, by the combination of image segmentation from visible images and fault location on infrared images. The results show that this framework dramatically improves the separation speed of photovoltaic array to 36 Fps and the accuracy of fault detection to 95% by infrared image marked with the segmented area.', 'publicationTitle': 'Solar Energy', 'volume': '236', 'issue': '', 'pages': '406-416', 'date': '2022-04-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Solar Energy', 'language': '', 'DOI': '10.1016/j.solener.2022.03.018', 'ISSN': '0038-092X', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0038092X22001840', 'accessDate': '2025-07-10T08:21:38Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Image fusion', 'type': 1}, {'tag': 'Module defect', 'type': 1}, {'tag': 'PV plant', 'type': 1}, {'tag': 'ResNet', 'type': 1}, {'tag': 'YOLOv5', 'type': 1}], 'collections': ['NGZJGTG9'], 'relations': {}, 'dateAdded': '2025-07-10T08:21:38Z', 'dateModified': '2025-07-10T08:21:38Z'}\n",
      "{'key': '52TYGAL4', 'version': 4996, 'itemType': 'journalArticle', 'title': 'Automatic detection of photovoltaic module defects in infrared images with isolated and develop-model transfer deep learning通过隔离和开发模型迁移深度学习自动检测红外图像中的光伏组件缺陷', 'creators': [{'creatorType': 'author', 'firstName': 'M. Waqar', 'lastName': 'Akram'}, {'creatorType': 'author', 'firstName': 'Guiqiang', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Jin'}, {'creatorType': 'author', 'firstName': 'Xiao', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Changan', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Ashfaq', 'lastName': 'Ahmad'}], 'abstractNote': 'With the rising use of photovoltaic and ongoing installation of large-scale photovoltaic systems worldwide, the automation of photovoltaic monitoring methods becomes important, as manual/visual inspection has limited applications. This research work deals with automatic detection of photovoltaic module defects in Infrared images with isolated deep learning and develop-model transfer deep learning techniques. An Infrared images dataset containing infrared images of normal operating and defective modules is collected and used to train the networks. The dataset is obtained from Infrared imaging performed on normal operating and defective photovoltaic modules with lab induced defects. An isolated learned model is trained from scratch using a light convolutional neural network design that achieved an average accuracy of 98.67%. For transfer learning, a base model is first developed (pre-trained) from electroluminescence images dataset of photovoltaic cells and then fine-tuned on infrared images dataset, that achieved an average accuracy of 99.23%. Both frameworks require low computation power and less time; and can be implemented with ordinary hardware. They also maintained real time prediction speed. The comparison shows that the develop-model transfer learning technique can help to improve the performance. In addition, we reviewed different kind of defects detectable from infrared imaging of photovoltaic modules, that can help in manual labelling for identifying different defect categories upon access to new huge data in future studies. Last of all, the presented frameworks are applied for experimental testing and qualitative evaluation.', 'publicationTitle': 'Solar Energy', 'volume': '198', 'issue': '', 'pages': '175-186', 'date': '2020-03-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Solar Energy', 'language': '', 'DOI': '10.1016/j.solener.2020.01.055', 'ISSN': '0038-092X', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0038092X20300621', 'accessDate': '2025-07-10T08:21:16Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Automatic defect detection', 'type': 1}, {'tag': 'Develop-model transfer deep learning', 'type': 1}, {'tag': 'Infrared images', 'type': 1}, {'tag': 'Isolated deep learning', 'type': 1}, {'tag': 'Photovoltaic (PV) modules', 'type': 1}, {'tag': 'Thermography', 'type': 1}], 'collections': ['NGZJGTG9'], 'relations': {}, 'dateAdded': '2025-07-10T08:21:16Z', 'dateModified': '2025-07-10T08:21:18Z'}\n",
      "{'key': 'N48QCALG', 'version': 4973, 'itemType': 'conferencePaper', 'title': 'A deep learning based approach for detecting panels in photovoltaic plants', 'creators': [{'creatorType': 'author', 'firstName': 'Antonio', 'lastName': 'Greco'}, {'creatorType': 'author', 'firstName': 'Christopher', 'lastName': 'Pironti'}, {'creatorType': 'author', 'firstName': 'Alessia', 'lastName': 'Saggese'}, {'creatorType': 'author', 'firstName': 'Mario', 'lastName': 'Vento'}, {'creatorType': 'author', 'firstName': 'Vincenzo', 'lastName': 'Vigilante'}], 'abstractNote': 'Photovoltaic (PV) panels are a clean and widespread way to produce renewable energy from sunlight; at the same time, such plants require maintenance, since solar panels can be affected by many types of damaging factors and have a limited yet variable lifespan. With the impressive growth of such PV installations, it is in the public eye the need of a cheap and effective way to continuously monitor the state of the plants and a standard technique designed to promptly replace broken modules, in order to prevent drops in the energy production. Since the faults mainly appear as Hot Spots on the surface of the PV panels, aerial thermal imaging can be used to diagnose such problems and also locate them in huge plants. To this aim, dedicated automatic Computer Vision methods are able to automatically find hot spots from thermal images, where they appear as white stains. In these methods a fundamental step is the segmentation of the PV panels, which allows to automatically detect each module.', 'date': '2020-01-07', 'proceedingsTitle': 'Proceedings of the 3rd International Conference on Applications of Intelligent Systems', 'conferenceName': 'APPIS 2020: 3rd International Conference on Applications of Intelligent Systems', 'place': 'Las Palmas de Gran Canaria Spain', 'publisher': 'ACM', 'volume': '', 'pages': '1-7', 'series': '', 'language': 'en', 'DOI': '10.1145/3378184.3378185', 'ISBN': '978-1-4503-7630-3', 'shortTitle': '', 'url': 'https://dl.acm.org/doi/10.1145/3378184.3378185', 'accessDate': '2025-06-24T02:58:35Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-06-24T02:58:35Z', 'dateModified': '2025-06-24T02:58:35Z'}\n",
      "{'key': 'DDSN7V7W', 'version': 5030, 'itemType': 'journalArticle', 'title': 'DAF-DETR: A dynamic adaptation feature transformer for enhanced object detection in unmanned aerial vehicles', 'creators': [{'creatorType': 'author', 'firstName': 'Baoye', 'lastName': 'Song'}, {'creatorType': 'author', 'firstName': 'Shihao', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Zidong', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Weibo', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Xiaohui', 'lastName': 'Liu'}], 'abstractNote': 'Object detection in complex environments is challenged by overlapping objects, complex spatial relationships, and dynamic variations in target scales. To address these challenges, the Dynamic Adaptation Feature DEtection TRansformer (DAF-DETR) is proposed as a novel transformer-based model optimized for real-time detection in spatially complex environments. The framework introduces four key innovations. First, a learnable position encoding mechanism is employed in place of fixed positional encoding, enhancing adaptability and flexibility when processing complex spatial layouts. Second, the Resynthetic Network (ResynNet) backbone, which consists of stacked Resynthetic Blocks (ResynBlocks) integrating ResBlock and FasterBlock feature extraction strategies, is designed to optimize multi-scale feature representation and improve computational efficiency. Third, an enhanced feature fusion module is incorporated to strengthen the detection of small, densely packed objects by integrating multi-scale contextual information. Fourth, a dynamic perception module is introduced, utilizing deformable attention to capture complex spatial relationships between overlapping objects. Extensive experiments conducted on the Vision meets Drone 2019 (VisDrone2019) and Tiny Object Detection in Aerial Images (AI-TOD) datasets demonstrate the superiority of DAF-DETR, achieving state-of-the-art detection accuracy while maintaining real-time efficiency. The results confirm its robustness in handling scale variations, occlusions, and spatial complexity, establishing it as a reliable solution for real-world applications such as aerial imagery and crowded scene analysis.', 'publicationTitle': 'Knowledge-Based Systems', 'volume': '323', 'issue': '', 'pages': '113760', 'date': '07/2025', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Knowledge-Based Systems', 'language': 'en', 'DOI': '10.1016/j.knosys.2025.113760', 'ISSN': '09507051', 'shortTitle': 'DAF-DETR', 'url': 'https://linkinghub.elsevier.com/retrieve/pii/S0950705125008068', 'accessDate': '2025-06-16T08:34:29Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['7TU9FL49'], 'relations': {}, 'dateAdded': '2025-06-16T08:34:29Z', 'dateModified': '2025-06-16T08:34:30Z'}\n",
      "{'key': 'KPGFDMBP', 'version': 5083, 'itemType': 'preprint', 'title': 'Rethinking Transformer-Based Blind-Spot Network for Self-Supervised Image Denoising', 'creators': [{'creatorType': 'author', 'firstName': 'Junyi', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Zhilu', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Wangmeng', 'lastName': 'Zuo'}], 'abstractNote': 'Blind-spot networks (BSN) have been prevalent neural architectures in self-supervised image denoising (SSID). However, most existing BSNs are conducted with convolution layers. Although transformers have shown the potential to overcome the limitations of convolutions in many image restoration tasks, the attention mechanisms may violate the blind-spot requirement, thereby restricting their applicability in BSN. To this end, we propose to analyze and redesign the channel and spatial attentions to meet the blind-spot requirement. Specifically, channel self-attention may leak the blind-spot information in multi-scale architectures, since the downsampling shuffles the spatial feature into channel dimensions. To alleviate this problem, we divide the channel into several groups and perform channel attention separately. For spatial selfattention, we apply an elaborate mask to the attention matrix to restrict and mimic the receptive field of dilated convolution. Based on the redesigned channel and window attentions, we build a Transformer-based Blind-Spot Network (TBSN), which shows strong local fitting and global perspective abilities. Furthermore, we introduce a knowledge distillation strategy that distills TBSN into smaller denoisers to improve computational efficiency while maintaining performance. Extensive experiments on real-world image denoising datasets show that TBSN largely extends the receptive field and exhibits favorable performance against state-of-theart SSID methods.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2404.07846', 'place': '', 'date': '2024-12-17', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2404.07846', 'citationKey': '', 'url': 'http://arxiv.org/abs/2404.07846', 'accessDate': '2025-06-16T08:01:21Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2404.07846 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Electrical Engineering and Systems Science - Image and Video Processing', 'type': 1}], 'collections': ['49A7H9UF'], 'relations': {}, 'dateAdded': '2025-06-16T08:01:21Z', 'dateModified': '2025-06-16T08:01:21Z'}\n",
      "{'key': 'PYFKLHRA', 'version': 4958, 'itemType': 'journalArticle', 'title': 'Fast fault detection method for photovoltaic arrays with adaptive deep multiscale feature enhancement', 'creators': [{'creatorType': 'author', 'firstName': 'Bin', 'lastName': 'Gong'}, {'creatorType': 'author', 'firstName': 'Aimin', 'lastName': 'An'}, {'creatorType': 'author', 'firstName': 'Yaoke', 'lastName': 'Shi'}, {'creatorType': 'author', 'firstName': 'Xuemin', 'lastName': 'Zhang'}], 'abstractNote': 'Photovoltaic (PV) arrays have output characteristics such as randomness and intermittency, and faults can seriously affect the safe operation of the power system. In order to improve the comprehensive performance of the PV array fault diagnosis model, a new intelligent online fault monitoring method for PV arrays is proposed in this paper. (1) a three-dimensional channel feature map based on I, V, and P features is constructed because the IV and P curves of the PV array have significantly different effects under different fault conditions. (2) The PV array fault diagnosis model based on a multi-source information fusion network (MIFNet) is proposed, and Channel Mixing Convolution (CMC) module, three-dimensional feature attention enhancement (TDFAE) module, and Channel normalized scaling (CNS) module are designed to improve the comprehensive performance of the model. (3) An adaptive nonlinear mutual sparrow search algorithm (ANMSSA) is proposed to optimize the hyperparameter configuration of the MIFNet network. The experimental results show that the average recognition accuracy, prediction accuracy, and sensitivity of the ANMSSA-MIFNet network proposed in this paper are 99.64%, 99.64%, and 99.71% respectively. When facing single-component faults and multi-component faults, the model has stronger diagnostic accuracy, robustness, anti-noise ability, and stability, and can efficiently diagnose different faults of PV arrays, providing the scientific basis and theoretical support for the operation of PV systems.', 'publicationTitle': 'Applied Energy', 'volume': '353', 'issue': '', 'pages': '122071', 'date': '01/2024', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Applied Energy', 'language': 'en', 'DOI': '10.1016/j.apenergy.2023.122071', 'ISSN': '03062619', 'shortTitle': '', 'url': 'https://linkinghub.elsevier.com/retrieve/pii/S0306261923014356', 'accessDate': '2025-06-11T08:34:51Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2025-06-11T08:34:51Z', 'dateModified': '2025-06-11T08:34:51Z'}\n",
      "{'key': '4JPSLBSX', 'version': 4946, 'itemType': 'journalArticle', 'title': 'Photovoltaic Cell Defect Detection by Lock-In Thermography Using 2-D Gaussian Profile', 'creators': [{'creatorType': 'author', 'firstName': 'Thiago Mota', 'lastName': 'Vieira'}, {'creatorType': 'author', 'firstName': 'Ézio C.', 'lastName': 'Santana'}, {'creatorType': 'author', 'firstName': 'Tarso V.', 'lastName': 'Ferreira'}, {'creatorType': 'author', 'firstName': 'Douglas B.', 'lastName': 'Riffel'}], 'abstractNote': 'The electrical energy produced by photovoltaic systems can be critically affected by a variety of factors. In order to detect defective photovoltaic cells, several monitoring techniques, such as lock-in thermography, have been widely used alongside some analytical methods that avoid subjectivity. This article proposes a method with low computational cost that provides a simple and easily implementable way to quantifiably discern if a photovoltaic cell is defective or not. A two-dimensional Gaussian fit is applied to images generated by fast Fourier transform and principal component analysis algorithms on thermographic data from lock-in thermography tests. The considered coefficient of determination R^2 was found to be a good measure of fitting quality. Additionally, the method highlighted the potential of its application on first principal component, with R^2 between 0.944 and 0.986, and magnitude images, with R^2 between 0.965 and 0.985, in order to identify and distinguish nondefective cells from defective ones.', 'publicationTitle': 'IEEE Journal of Photovoltaics', 'volume': '14', 'issue': '3', 'pages': '480-487', 'date': '2024-05', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/JPHOTOV.2024.3362134', 'ISSN': '2156-3403', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/10438520', 'accessDate': '2025-06-10T06:08:59Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Coefficient of determination', 'type': 1}, {'tag': 'Fast Fourier transforms', 'type': 1}, {'tag': 'Gaussian processes', 'type': 1}, {'tag': 'Photovoltaic cells', 'type': 1}, {'tag': 'Photovoltaic systems', 'type': 1}, {'tag': 'Principal component analysis', 'type': 1}, {'tag': 'Surface cracks', 'type': 1}, {'tag': 'lock-in thermography (LIT)', 'type': 1}, {'tag': 'photovoltaic (PV) cell', 'type': 1}, {'tag': 'two-dimensional (2-D) Gaussian fit', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-06-10T06:08:59Z', 'dateModified': '2025-06-10T06:08:59Z'}\n",
      "{'key': 'J49FANMG', 'version': 4941, 'itemType': 'journalArticle', 'title': 'Faults Detection for Photovoltaic Field Based on K-Means, Elbow, and Average Silhouette Techniques through the Segmentation of a Thermal Image', 'creators': [{'creatorType': 'author', 'firstName': 'Abdelilah', 'lastName': 'Et-taleby'}, {'creatorType': 'author', 'firstName': 'Mohammed', 'lastName': 'Boussetta'}, {'creatorType': 'author', 'firstName': 'Mohamed', 'lastName': 'Benslimane'}], 'abstractNote': 'Clustering or grouping is among the most important image processing methods that aim to split an image into different groups. Examining the literature, many clustering algorithms have been carried ou...', 'publicationTitle': 'International Journal of Photoenergy', 'volume': '2020', 'issue': '1', 'pages': '6617597', 'date': '2020/01/01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '10.1155/2020/6617597', 'ISSN': '1687-529X', 'shortTitle': '', 'url': 'https://onlinelibrary.wiley.com/doi/10.1155/2020/6617597', 'accessDate': '2025-06-10T05:41:23Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'onlinelibrary.wiley.com', 'callNumber': '', 'rights': '', 'extra': 'Publisher: John Wiley & Sons, Ltd', 'tags': [], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-06-10T05:41:23Z', 'dateModified': '2025-06-10T05:41:23Z'}\n",
      "{'key': 'GHLMEJKX', 'version': 4936, 'itemType': 'journalArticle', 'title': 'Thermal Infrared and Visual Inspection of Photovoltaic Installations by UAV Photogrammetry—Application Case: Morocco', 'creators': [{'creatorType': 'author', 'firstName': 'Yahya', 'lastName': 'Zefri'}, {'creatorType': 'author', 'firstName': 'Achraf', 'lastName': 'ElKettani'}, {'creatorType': 'author', 'firstName': 'Imane', 'lastName': 'Sebari'}, {'creatorType': 'author', 'firstName': 'Sara', 'lastName': 'Ait Lamallam'}], 'abstractNote': 'Being sustainable, clean, and eco-friendly, photovoltaic technology is considered as one of the most hoped solutions face to worldwide energetic challenges. Morocco joins this context with the inauguration of numerous clean energy projects. However, one key factor in making photovoltaic installations a profitable investment are regular and effective inspections in order to detect occurred defects. Unmanned aerial vehicles (UAV) are increasingly used in various inspection fields. In this respect, this work focuses on the use of thermal and visual imagery taken by UAV in the inspection of photovoltaic installations. Visual and thermal images of photovoltaic modules, obtained by UAV, from different installations, and with different acquisition conditions and parameters, were exploited to generate orthomosaics for inspection purposes. The methodology was tested on a dataset we have acquired by a mission in Rabat (Morocco), and also on external datasets acquired in Switzerland. As final results, several visual defects were detected in visual RGB and thermal orthomosaics, such as cracks, soiling, and hotspots. In addition, a procedure of semi-automatic hotspots’ extraction was also developed and is presented within this work. On the other side, various tests were conducted on the influence of some acquisition and processing parameters (images’ overlap, the ground sampling distance, the flying height, the use of ground control points, the internal camera parameters’ optimization) on the detection of defects and the quality of visual and thermal generated orthomosaics. In the end, the potential of UAV thermal and visual imagery in the inspection of photovoltaic installations was discussed in function of various parameters. On the basis of the discussion feedback, UAV were concluded as advantageous tools within the thematic of this project, which proves the necessity of their implementation in this context.', 'publicationTitle': 'Drones', 'volume': '2', 'issue': '4', 'pages': '41', 'date': '2018/12', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '10.3390/drones2040041', 'ISSN': '2504-446X', 'shortTitle': 'Thermal Infrared and Visual Inspection of Photovoltaic Installations by UAV Photogrammetry—Application Case', 'url': 'https://www.mdpi.com/2504-446X/2/4/41', 'accessDate': '2025-06-10T03:45:27Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'www.mdpi.com', 'callNumber': '', 'rights': 'http://creativecommons.org/licenses/by/3.0/', 'extra': 'Number: 4\\nPublisher: Multidisciplinary Digital Publishing Institute', 'tags': [{'tag': 'UAV', 'type': 1}, {'tag': 'automatic detection', 'type': 1}, {'tag': 'defects', 'type': 1}, {'tag': 'photovoltaic installation', 'type': 1}, {'tag': 'thermal infrared inspection', 'type': 1}, {'tag': 'visual inspection', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-06-10T03:45:27Z', 'dateModified': '2025-06-10T03:45:27Z'}\n",
      "{'key': '6IL8UZAM', 'version': 4921, 'itemType': 'journalArticle', 'title': 'Automatic supervision and fault detection of PV systems based on power losses analysis', 'creators': [{'creatorType': 'author', 'firstName': 'A.', 'lastName': 'Chouder'}, {'creatorType': 'author', 'firstName': 'S.', 'lastName': 'Silvestre'}], 'abstractNote': 'In this work, we present a new automatic supervision and fault detection procedure for PV systems, based on the power losses analysis. This automatic supervision system has been developed in Matlab&Simulink environment. It includes parameter extraction techniques to calculate main PV system parameters from monitoring data in real conditions of work, taking into account the environmental irradiance and module temperature evolution, allowing simulation of the PV system behaviour in real time. The automatic supervision method analyses the output power losses, presents in the DC side of the PV generator, capture losses. Two new power losses indicators are deﬁned: thermal capture losses (Lct) and miscellaneous capture losses (Lcm). The processing of these indicators allows the supervision system to generate a faulty signal as indicator of fault detection in the PV system operation.', 'publicationTitle': 'Energy Conversion and Management', 'volume': '51', 'issue': '10', 'pages': '1929-1937', 'date': '10/2010', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Energy Conversion and Management', 'language': 'en', 'DOI': '10.1016/j.enconman.2010.02.025', 'ISSN': '01968904', 'shortTitle': '', 'url': 'https://linkinghub.elsevier.com/retrieve/pii/S0196890410000919', 'accessDate': '2025-06-10T02:40:56Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'https://www.elsevier.com/tdm/userlicense/1.0/', 'extra': '', 'tags': [], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-06-10T02:40:56Z', 'dateModified': '2025-06-10T02:40:56Z'}\n",
      "{'key': 'K7GY4NZW', 'version': 4911, 'itemType': 'journalArticle', 'title': 'A comprehensive study on different types of faults and detection techniques for solar photovoltaic system', 'creators': [{'creatorType': 'author', 'firstName': 'Siva Ramakrishna', 'lastName': 'Madeti'}, {'creatorType': 'author', 'firstName': 'S. N.', 'lastName': 'Singh'}], 'abstractNote': 'Monitoring systems are essential to maintain optimal performance of photovoltaic (PV) systems. A critical aspect in such monitoring systems is the fault diagnosis technique being used. The role of a fault detection and diagnosis technique is to identify the causes affecting the real-time energy production and/or smooth functioning of PV systems. Over the past decade, various fault detection methods were reported in literature. Among all the fault detection techniques reported, some paid significant attention only on faults that occur in the PV system, some on faults on DC side of the PV system while the rest focused on AC side faults. For the first time, this paper provides a comprehensive review of popular fault detection techniques, addressing all major types of faults in PV systems. Detailed insights of PV fault detection techniques along with their relative performances are covered. A new fault detection technique is also proposed to identify the type and location (module level) of a fault. This review enables the reader to get acquaintance with major aspects/considerations in developing/choosing an effective yet viable fault detection technique for small and medium scale PV systems.', 'publicationTitle': 'Solar Energy', 'volume': '158', 'issue': '', 'pages': '161-185', 'date': '2017-12-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Solar Energy', 'language': '', 'DOI': '10.1016/j.solener.2017.08.069', 'ISSN': '0038-092X', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0038092X17307508', 'accessDate': '2025-06-10T01:48:42Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Different faults', 'type': 1}, {'tag': 'Fault detection techniques', 'type': 1}, {'tag': 'Photovoltaic systems', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-06-10T01:48:42Z', 'dateModified': '2025-06-10T01:48:42Z'}\n",
      "{'key': 'VHECFUDS', 'version': 4905, 'itemType': 'conferencePaper', 'title': 'Real-time model base fault diagnosis of PV panels using statistical signal processing', 'creators': [{'creatorType': 'author', 'firstName': 'M.', 'lastName': 'Davarifar'}, {'creatorType': 'author', 'firstName': 'A.', 'lastName': 'Rabhi'}, {'creatorType': 'author', 'firstName': 'A.', 'lastName': 'El-Hajjaji'}, {'creatorType': 'author', 'firstName': 'M.', 'lastName': 'Dahmane'}], 'abstractNote': 'This paper proposes new method of monitoring and fault detection in photovoltaic systems, based mainly on the analysis of the power losses of the photovoltaic system (PV) by using statistical signal processing. Firstly, real time new universal circuit based model of photovoltaic panels is presented. Then, the development of software fault detection on a real installation is performed under the MATLAB/Simulink environment. With model based fault diagnosis analysis, residual signal from comparing Simulink and real model is generated. To observe clear alarm signal from arbitrary data captured, Wald test technic is applied on residual signal. A model residual based on Sequential Probability Ratio Test (WSPRT) framework for electrical fault diagnosis in PV system is introduced.', 'date': '2013-10', 'proceedingsTitle': '2013 International Conference on Renewable Energy Research and Applications (ICRERA)', 'conferenceName': '2013 International Conference on Renewable Energy Research and Applications (ICRERA)', 'place': '', 'publisher': '', 'volume': '', 'pages': '599-604', 'series': '', 'language': '', 'DOI': '10.1109/ICRERA.2013.6749826', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/6749826', 'accessDate': '2025-06-09T18:15:09Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Fault diagnosis', 'type': 1}, {'tag': 'Integrated circuit modeling', 'type': 1}, {'tag': 'Mathematical model', 'type': 1}, {'tag': 'Monitoring', 'type': 1}, {'tag': 'PV system', 'type': 1}, {'tag': 'Photovoltaic systems', 'type': 1}, {'tag': 'Real-time systems', 'type': 1}, {'tag': 'Renewable energy sources', 'type': 1}, {'tag': 'faults diagnosis', 'type': 1}, {'tag': 'real time modeling', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-06-09T18:15:09Z', 'dateModified': '2025-06-09T18:15:09Z'}\n",
      "{'key': 'ZLE6HLEH', 'version': 4900, 'itemType': 'conferencePaper', 'title': 'Fault Detection and Diagnosis of Photovoltaic Systems through I-V Curve Analysis', 'creators': [{'creatorType': 'author', 'firstName': 'Batoul', 'lastName': 'Zbib'}, {'creatorType': 'author', 'firstName': 'Hiba', 'lastName': 'Al Sheikh'}], 'abstractNote': 'This work presents an algorithm to detect and diagnose faults in PhotoVoltaic (PV) systems based on the I-V curve analysis. Three types of faults are investigated: mismatch and shading faults, connectivity faults and short circuit faults. The PV system is modeled using MATLAB/Simulink to simulate the faulty I-V curve behavior for each fault. During each simulation, the I-V curve is examined and compared with that during normal operation (without faults), in order to identify and characterize the anomalies. The different faulty modes affect the I-V characteristics of the PV string in different ways, leaving distinct signatures during its operation. Four attributes are extracted allowing for classification of faults into five classes. For some classes involving more than one fault, further analysis and comparison is carried out to allow discrimination between them. Results show that the proposed technique has good performance in detecting faults even if they are not severe.', 'date': '2020-06', 'proceedingsTitle': '2020 International Conference on Electrical, Communication, and Computer Engineering (ICECCE)', 'conferenceName': '2020 International Conference on Electrical, Communication, and Computer Engineering (ICECCE)', 'place': '', 'publisher': '', 'volume': '', 'pages': '1-6', 'series': '', 'language': '', 'DOI': '10.1109/ICECCE49384.2020.9179390', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/9179390', 'accessDate': '2025-06-09T18:04:29Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Circuit faults', 'type': 1}, {'tag': 'Fault detection', 'type': 1}, {'tag': 'Fault diagnosis', 'type': 1}, {'tag': 'I-V curve analysis', 'type': 1}, {'tag': 'Integrated circuit modeling', 'type': 1}, {'tag': 'Mathematical model', 'type': 1}, {'tag': 'PV string', 'type': 1}, {'tag': 'Radiation effects', 'type': 1}, {'tag': 'Temperature', 'type': 1}, {'tag': 'fault detection and diagnosis', 'type': 1}, {'tag': 'faults', 'type': 1}, {'tag': 'photovoltaic systems', 'type': 1}, {'tag': 'symptoms', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-06-09T18:04:29Z', 'dateModified': '2025-06-09T18:04:29Z'}\n",
      "{'key': 'JYI5TEC6', 'version': 4896, 'itemType': 'journalArticle', 'title': 'Identifying PV Module Mismatch Faults by a Thermography-Based Temperature Distribution Analysis', 'creators': [{'creatorType': 'author', 'firstName': 'Yihua', 'lastName': 'Hu'}, {'creatorType': 'author', 'firstName': 'Wenping', 'lastName': 'Cao'}, {'creatorType': 'author', 'firstName': 'Jien', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Stephen J.', 'lastName': 'Finney'}, {'creatorType': 'author', 'firstName': 'David', 'lastName': 'Li'}], 'abstractNote': 'Photovoltaic (PV) solar power generation is proven to be effective and sustainable but is currently hampered by relatively high costs and low conversion efficiency. This paper addresses both issues by presenting a low-cost and efficient temperature distribution analysis for identifying PV module mismatch faults by thermography. Mismatch faults reduce the power output and cause potential damage to PV cells. This paper first defines three fault categories in terms of fault levels, which lead to different terminal characteristics of the PV modules. The investigation of three faults is also conducted analytically and experimentally, and maintenance suggestions are also provided for different fault types. The proposed methodology is developed to combine the electrical and thermal characteristics of PV cells subjected to different fault mechanisms through simulation and experimental tests. Furthermore, the fault diagnosis method can be incorporated into the maximum power point tracking schemes to shift the operating point of the PV string. The developed technology has improved over the existing ones in locating the faulty cell by a thermal camera, providing a remedial measure, and maximizing the power output under faulty conditions.', 'publicationTitle': 'IEEE Transactions on Device and Materials Reliability', 'volume': '14', 'issue': '4', 'pages': '951-960', 'date': '2014-12', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/TDMR.2014.2348195', 'ISSN': '1558-2574', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/6879295', 'accessDate': '2025-06-09T17:25:13Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Arrays', 'type': 1}, {'tag': 'Cameras', 'type': 1}, {'tag': 'Circuit faults', 'type': 1}, {'tag': 'Degradation', 'type': 1}, {'tag': 'Fault diagnosis', 'type': 1}, {'tag': 'Lighting', 'type': 1}, {'tag': 'Temperature distribution', 'type': 1}, {'tag': 'Temperature measurement', 'type': 1}, {'tag': 'fault diagnosis', 'type': 1}, {'tag': 'photovoltaic (PV) power systems', 'type': 1}, {'tag': 'temperature', 'type': 1}, {'tag': 'thermography', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-06-09T17:25:13Z', 'dateModified': '2025-06-09T17:25:13Z'}\n",
      "{'key': '75I5AK94', 'version': 4892, 'itemType': 'journalArticle', 'title': 'Deep residual network based fault detection and diagnosis of photovoltaic arrays using current-voltage curves and ambient conditions基于深度残差网络的故障检测，以及使用电流-电压曲线和环境条件对光伏阵列进行诊断', 'creators': [{'creatorType': 'author', 'firstName': 'Zhicong', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Yixiang', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Lijun', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Shuying', 'lastName': 'Cheng'}, {'creatorType': 'author', 'firstName': 'Peijie', 'lastName': 'Lin'}], 'abstractNote': 'Automatic fault detection and diagnosis techniques for photovoltaic arrays are crucial to promote the efficiency, reliability and safety of photovoltaic systems. In recent decades, many conventional artificial intelligence approaches have been successfully applied to automatically establish fault detection and diagnosis model using fault data samples, but most of them rely on manual feature extraction or expert knowledge to build diagnosis models, which is inefficient and may ignore some potential useful features. In addition, they usually use shallow neural networks with limited performance. Addressing the issues, this paper proposes a novel intelligent fault detection and diagnosis method for photovoltaic arrays based on a newly designed deep residual network model trained by the adaptive moment estimation deep learning algorithm, which can automatically extract features from raw current-voltage curves and ambient irradiance and temperature, and effectively improve the performance with a deeper network. In order to validate the proposed fault diagnosis model, a Simulink based simulation model is designed for a real laboratory photovoltaic array, and both fault simulation and real experiments are carried out to obtain simulation and experimental fault datasets. Furthermore, two other popular deep learning based models are used for comparison, including convolution neural network and convolutional auto-encoder. Both of simulation and real experimental comparison results demonstrate that the proposed deep residual network based method achieves high and best overall performance in terms of accuracy, generalization performance, reliability and training efficiency.', 'publicationTitle': 'Energy Conversion and Management', 'volume': '198', 'issue': '', 'pages': '111793', 'date': '2019-10-15', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Energy Conversion and Management', 'language': '', 'DOI': '10.1016/j.enconman.2019.111793', 'ISSN': '0196-8904', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0196890419307757', 'accessDate': '2025-06-09T17:09:16Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Current-voltage characteristic curves', 'type': 1}, {'tag': 'Deep learning', 'type': 1}, {'tag': 'Deep residual networks', 'type': 1}, {'tag': 'Fault detection and diagnosis', 'type': 1}, {'tag': 'Photovoltaic arrays', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-06-09T17:09:16Z', 'dateModified': '2025-06-09T17:09:16Z'}\n",
      "{'key': 'LJEXX5AG', 'version': 4890, 'itemType': 'journalArticle', 'title': 'Reliable fault detection and diagnosis of photovoltaic systems based on statistical monitoring approaches', 'creators': [{'creatorType': 'author', 'firstName': 'Fouzi', 'lastName': 'Harrou'}, {'creatorType': 'author', 'firstName': 'Ying', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'Bilal', 'lastName': 'Taghezouit'}, {'creatorType': 'author', 'firstName': 'Ahmed', 'lastName': 'Saidi'}, {'creatorType': 'author', 'firstName': 'Mohamed-Elkarim', 'lastName': 'Hamlati'}], 'abstractNote': '', 'publicationTitle': 'Renewable Energy', 'volume': '116', 'issue': '', 'pages': '22-37', 'date': '02/2018', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Renewable Energy', 'language': 'en', 'DOI': '10.1016/j.renene.2017.09.048', 'ISSN': '09601481', 'shortTitle': '', 'url': 'https://linkinghub.elsevier.com/retrieve/pii/S0960148117309114', 'accessDate': '2025-06-09T16:44:50Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-06-09T16:44:50Z', 'dateModified': '2025-06-09T16:44:50Z'}\n",
      "{'key': '4AI77F58', 'version': 5062, 'itemType': 'journalArticle', 'title': 'FBRT-YOLO: Faster and Better for Real-Time Aerial Image Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Yao', 'lastName': 'Xiao'}, {'creatorType': 'author', 'firstName': 'Tingfa', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Yu', 'lastName': 'Xin'}, {'creatorType': 'author', 'firstName': 'Jianan', 'lastName': 'Li'}], 'abstractNote': 'Embedded flight devices with visual capabilities have become essential for a wide range of applications. \\nIn aerial image detection, while many existing methods have partially addressed the issue of small target detection, challenges remain in optimizing small target detection and balancing detection accuracy with efficiency.\\nThese issues are key obstacles to the advancement of real-time aerial image detection.\\nIn this paper, we propose a new family of real-time detectors for aerial image detection, named FBRT-YOLO, to address the imbalance between detection accuracy and efficiency. Our method comprises two lightweight modules: Feature Complementary Mapping Module (FCM) and Multi-Kernel Perception Unit (MKP), designed to enhance object perception for small targets in aerial images.\\nFCM focuses on alleviating the problem of information imbalance caused by the loss of small target information in deep networks. It aims to integrate spatial positional information of targets more deeply into the network, better aligning with semantic information in the deeper layers to improve the localization of small targets.\\nWe introduce MKP, which leverages convolutions with kernels of different sizes to enhance the relationships between targets of various scales and improve the perception of targets at different scales.\\nExtensive experimental results on three major aerial image datasets, including Visdrone, UAVDT, and AI-TOD, demonstrate that FBRT-YOLO outperforms various real-time detectors in terms of performance and speed.', 'publicationTitle': 'Proceedings of the AAAI Conference on Artificial Intelligence', 'volume': '39', 'issue': '8', 'pages': '8673-8681', 'date': '2025-04-11', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '10.1609/aaai.v39i8.32937', 'ISSN': '2374-3468', 'shortTitle': 'FBRT-YOLO', 'url': 'https://ojs.aaai.org/index.php/AAAI/article/view/32937', 'accessDate': '2025-05-23T00:58:33Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ojs.aaai.org', 'callNumber': '', 'rights': 'Copyright (c) 2025 Association for the Advancement of Artificial Intelligence', 'extra': 'Number: 8', 'tags': [], 'collections': ['49A7H9UF'], 'relations': {}, 'dateAdded': '2025-05-23T00:58:33Z', 'dateModified': '2025-05-23T00:58:35Z'}\n",
      "{'key': 'EG7STRSR', 'version': 4798, 'itemType': 'journalArticle', 'title': 'Fast fault detection method for photovoltaic arrays with adaptive deep multiscale feature enhancement', 'creators': [{'creatorType': 'author', 'firstName': 'Bin', 'lastName': 'Gong'}, {'creatorType': 'author', 'firstName': 'Aimin', 'lastName': 'An'}, {'creatorType': 'author', 'firstName': 'Yaoke', 'lastName': 'Shi'}, {'creatorType': 'author', 'firstName': 'Xuemin', 'lastName': 'Zhang'}], 'abstractNote': 'Photovoltaic (PV) arrays have output characteristics such as randomness and intermittency, and faults can seriously affect the safe operation of the power system. In order to improve the comprehensive performance of the PV array fault diagnosis model, a new intelligent online fault monitoring method for PV arrays is proposed in this paper. (1) a three-dimensional channel feature map based on I, V, and P features is constructed because the I-V and P curves of the PV array have significantly different effects under different fault conditions. (2) The PV array fault diagnosis model based on a multi-source information fusion network (MIFNet) is proposed, and Channel Mixing Convolution (CMC) module, three-dimensional feature attention enhancement (TDFAE) module, and Channel normalized scaling (CNS) module are designed to improve the comprehensive performance of the model. (3) An adaptive nonlinear mutual sparrow search algorithm (ANMSSA) is proposed to optimize the hyperparameter configuration of the MIFNet network. The experimental results show that the average recognition accuracy, prediction accuracy, and sensitivity of the ANMSSA-MIFNet network proposed in this paper are 99.64%, 99.64%, and 99.71% respectively. When facing single-component faults and multi-component faults, the model has stronger diagnostic accuracy, robustness, anti-noise ability, and stability, and can efficiently diagnose different faults of PV arrays, providing the scientific basis and theoretical support for the operation of PV systems.', 'publicationTitle': 'Applied Energy', 'volume': '353', 'issue': '', 'pages': '122071', 'date': '2024-01-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Applied Energy', 'language': '', 'DOI': '10.1016/j.apenergy.2023.122071', 'ISSN': '0306-2619', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0306261923014356', 'accessDate': '2025-05-06T16:00:36Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Fault diagnosis', 'type': 1}, {'tag': 'Improved sparrow optimization algorithm', 'type': 1}, {'tag': 'Multi-scale feature fusion', 'type': 1}, {'tag': 'Photovoltaic arrays', 'type': 1}, {'tag': 'Three-dimensional feature attention enhancement module', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-05-06T16:00:36Z', 'dateModified': '2025-05-06T16:00:36Z'}\n",
      "{'key': 'SWHT8BT8', 'version': 4792, 'itemType': 'journalArticle', 'title': 'An unsupervised hourly weather status pattern recognition and blending fitting model for PV system fault detection', 'creators': [{'creatorType': 'author', 'firstName': 'Jiaqi', 'lastName': 'Qu'}, {'creatorType': 'author', 'firstName': 'Zheng', 'lastName': 'Qian'}, {'creatorType': 'author', 'firstName': 'Yan', 'lastName': 'Pei'}, {'creatorType': 'author', 'firstName': 'Lu', 'lastName': 'Wei'}, {'creatorType': 'author', 'firstName': 'Hamidreza', 'lastName': 'Zareipour'}, {'creatorType': 'author', 'firstName': 'Qiang', 'lastName': 'Sun'}], 'abstractNote': 'Detecting PV system faults in a timely fashion is important to ensure the safe operation of equipment and reduce their impact on the economy of the PV systems. It is necessary to further improve the time-sensitive performance evaluation of the system. However, the hourly weather scenario segmentations are seldom considered during the hour-level online monitoring process. Therefore, a hybrid method based on unsupervised hourly weather status pattern recognition and blending fitting model is proposed for hourly fault detection to improve the performance evaluation of PV systems. The proposed solution includes three parts, firstly, in the data preprocessing stage, the measured power with the errors and noise under normal operation situation caused by the environment changes is corrected by monthly linear fitting. Secondly, an unsupervised hourly weather status pattern recognition method is constructed using the measured radiation data, including unsupervised clustering and the Multiclass-GBDT-LR classification process. Finally, after eliminating the anomalies and errors, the blending fitting model of the hourly sub-weather status is established. Through the analysis of power plants in Australia and China, the proposed solutions are validated and evaluated to be superior to existing data-driven solutions in terms of fitting accuracy, detection validity, and response time. Numerical results of case studies indicate that the developed methodology under sub-weather has improved the detection accuracy up to 97.71% and 99.29% compared to benchmark models.', 'publicationTitle': 'Applied Energy', 'volume': '319', 'issue': '', 'pages': '119271', 'date': '2022-08-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Applied Energy', 'language': '', 'DOI': '10.1016/j.apenergy.2022.119271', 'ISSN': '0306-2619', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0306261922006286', 'accessDate': '2025-04-26T08:20:00Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Blending fitting model', 'type': 1}, {'tag': 'Hourly fault detection', 'type': 1}, {'tag': 'Photovoltaic systems performance', 'type': 1}, {'tag': 'Unsupervised hourly weather status pattern recognition', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-04-26T08:20:00Z', 'dateModified': '2025-04-26T08:20:01Z'}\n",
      "{'key': '5LWU8XIS', 'version': 4788, 'itemType': 'journalArticle', 'title': 'Photovoltaic Bypass Diode Fault Detection Using Artificial Neural Networks', 'creators': [{'creatorType': 'author', 'firstName': 'Mahmoud', 'lastName': 'Dhimish'}, {'creatorType': 'author', 'firstName': 'Andy M.', 'lastName': 'Tyrrell'}], 'abstractNote': 'Due to the importance of determining faulty bypass diodes in photovoltaic (PV) systems, faulty bypass diodes have been of widespread interest in recent years due to their importance in improving PV system durability, operation, and overall safety. This article presents new work in developing an artificial intelligence (AI) based model using the principles of artificial neural networks (ANNs) to detect short and open PV bypass diode fault conditions. With only three inputs from the PV system, namely, the output power, short-circuit current, and open-circuit voltage, the developed ANN model can determine whether the PV bypass diodes are defective. In the experimentally validated case of short and open bypass diodes, 93.6% and 93.3% of faulty bypass diodes can be detected. Furthermore, the developed ANN model has an average precision and sensitivity of 96.4% and 92.6%, respectively.', 'publicationTitle': 'IEEE Transactions on Instrumentation and Measurement', 'volume': '72', 'issue': '', 'pages': '1-10', 'date': '2023', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/TIM.2023.3244230', 'ISSN': '1557-9662', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/10042455', 'accessDate': '2025-04-25T14:29:39Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Artificial intelligence (AI)', 'type': 1}, {'tag': 'Artificial neural networks', 'type': 1}, {'tag': 'Circuit faults', 'type': 1}, {'tag': 'Fault diagnosis', 'type': 1}, {'tag': 'Neurons', 'type': 1}, {'tag': 'Photovoltaic cells', 'type': 1}, {'tag': 'Photovoltaic systems', 'type': 1}, {'tag': 'Temperature measurement', 'type': 1}, {'tag': 'bypass diodes', 'type': 1}, {'tag': 'fault detection algorithm', 'type': 1}, {'tag': 'photovoltaics (PVs)', 'type': 1}], 'collections': ['NGZJGTG9'], 'relations': {}, 'dateAdded': '2025-04-25T14:29:40Z', 'dateModified': '2025-04-25T14:29:40Z'}\n",
      "{'key': 'D75IUGW9', 'version': 4779, 'itemType': 'journalArticle', 'title': 'Faults detection and diagnosis of PV systems based on machine learning approach using random forest classifier', 'creators': [{'creatorType': 'author', 'firstName': 'Ahmed Faris', 'lastName': 'Amiri'}, {'creatorType': 'author', 'firstName': 'Houcine', 'lastName': 'Oudira'}, {'creatorType': 'author', 'firstName': 'Aissa', 'lastName': 'Chouder'}, {'creatorType': 'author', 'firstName': 'Sofiane', 'lastName': 'Kichou'}], 'abstractNote': 'Accurate and reliable fault detection procedures are crucial for optimizing photovoltaic (PV) system performance. Establishing a trustworthy PV array model is the primary step and a vital tool for monitoring and diagnosing PV systems. This paper outlines a two-step approach for creating a reliable PV array model and implementing a fault detection procedure using Random Forest Classifiers (RFCs). Firstly, we extracted the five unknown parameters of the one-diode model (ODM) by combining the current–voltage translation method to predict the reference curve and employing the modified grey wolf optimization (MGWO) algorithm. In the second step, we simulated the PV array to obtain maximum power point (MPP) coordinates and construct operational databases through co-simulations in PSIM/MATLAB. We developed two RFCs: one for fault detection (a binary classifier) and another for fault diagnosis (a multiclass classifier). Our results confirmed the accuracy of the PV array modeling approach. We achieved a root mean square error (RMSE) value of 0.0122 for the ODM parameter extraction and RMSEs lower than 0.3 in dynamic PV array output current simulations under cloudy conditions. Regarding the fault detection procedure, our results demonstrate exceptional classification accuracy rates of 99.4% for both fault detection and diagnosis, surpassing other tested models like Support Vector Machines (SVM), K-Nearest Neighbors (KNN), Neural Networks (MLP Classifier), Decision Trees (DT), and Stochastic Gradient Descent (SGDC).', 'publicationTitle': 'Energy Conversion and Management', 'volume': '301', 'issue': '', 'pages': '118076', 'date': '2024-02-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Energy Conversion and Management', 'language': '', 'DOI': '10.1016/j.enconman.2024.118076', 'ISSN': '0196-8904', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0196890424000177', 'accessDate': '2025-04-25T12:46:06Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Co-simulation', 'type': 1}, {'tag': 'Dynamic MPP model', 'type': 1}, {'tag': 'Fault detection', 'type': 1}, {'tag': 'MGWO algorithm', 'type': 1}, {'tag': 'Parameter extraction', 'type': 1}, {'tag': 'Random forest classifier', 'type': 1}], 'collections': ['NGZJGTG9'], 'relations': {}, 'dateAdded': '2025-04-25T12:46:06Z', 'dateModified': '2025-04-25T12:46:11Z'}\n",
      "{'key': 'UBNRKFHD', 'version': 4770, 'itemType': 'journalArticle', 'title': 'Failures of Photovoltaic modules and their Detection: A Review', 'creators': [{'creatorType': 'author', 'firstName': 'M.', 'lastName': 'Waqar Akram'}, {'creatorType': 'author', 'firstName': 'Guiqiang', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Jin'}, {'creatorType': 'author', 'firstName': 'Xiao', 'lastName': 'Chen'}], 'abstractNote': 'Photovoltaic (PV) has emerged as a promising and phenomenal renewable energy technology in the recent past and the PV market has developed at an exponential rate during the time. However, a large number of early failure and degradation cases are also observed in the field. Besides these, there are fire risks associated with PV modules installed in the field, roof-mounted and building integrated PV systems, as modules contain combustible materials. The fire is caused by different failures and faults such as electrical arcs, short circuits, and hotspots. The timely, fast and accurate detection and measurement of failures is important to produce efficient and durable modules. Conventional visual monitoring and assessment process is commonly used in the field, which is mainly dependent upon human abilities and often involve human error. Moreover, it is only practicable on small-scale and requires long time. With the rising use of PV solar energy and ongoing installation of large-scale PV power plants worldwide, the automation of PV monitoring and assessment methods becomes important. Here, the present paper focuses on module failures, fire risks associated with PV modules, failure detection/measurements, and computer/machine vision or artificial intelligence (AI) based failure detection in PV modules; and can serve as a one-stop source for PV system inspectors. All types of failures occurred in PV modules including recent reported field failures are discussed in the paper. The fire risks associated with PV modules and reduction of fire risks and hotspots is also discussed. Different failure detection methods and recent advancements in these methods are presented. The strengths and limitations of each method is summarized. Moreover, the studies conducted on combined application and comparison of different methods are extensively reviewed. The boundary conditions of applications of different failure detection methods are provided which helps in selection of appropriate method. Subsequent to this, automatic techniques are introduced and their implementation and applications are discussed. The strengths and limitations of different automatic techniques and their applicability with respect to different conditions is discussed. This study may act as a one-stop guide for: acquiring information about module structure and failures, mitigation of fire risks and hotspots, selection of appropriate characterization method, application of different methods, automation of detection tasks, and remote PV plant inspection. The PV sector is at the start of AI journey and has a long path to go. The present paper is a significant step in the AI journey. 
The existing knowledge is organized systematically in a handy manner, thereby can facilitates new developments in AI-related research, fire risks mitigation, and failure detection.', 'publicationTitle': 'Applied Energy', 'volume': '313', 'issue': '', 'pages': '118822', 'date': '2022-05-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Applied Energy', 'language': '', 'DOI': '10.1016/j.apenergy.2022.118822', 'ISSN': '0306-2619', 'shortTitle': 'Failures of Photovoltaic modules and their Detection', 'url': 'https://www.sciencedirect.com/science/article/pii/S0306261922002677', 'accessDate': '2025-04-25T08:06:12Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Artificial intelligence and deep learning', 'type': 1}, {'tag': 'Defect detection', 'type': 1}, {'tag': 'Electroluminescence and Infrared imaging', 'type': 1}, {'tag': 'Module failures and fire risks', 'type': 1}, {'tag': 'Photovoltaic cells', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-04-25T08:06:12Z', 'dateModified': '2025-04-25T08:06:14Z'}\n",
      "{'key': 'MF998B3M', 'version': 4763, 'itemType': 'journalArticle', 'title': 'An innovative transformer neural network for fault detection and classification for photovoltaic modules', 'creators': [{'creatorType': 'author', 'firstName': 'E. A.', 'lastName': 'Ramadan'}, {'creatorType': 'author', 'firstName': 'Nada M.', 'lastName': 'Moawad'}, {'creatorType': 'author', 'firstName': 'Belal A.', 'lastName': 'Abouzalm'}, {'creatorType': 'author', 'firstName': 'Ali A.', 'lastName': 'Sakr'}, {'creatorType': 'author', 'firstName': 'Wessam F.', 'lastName': 'Abouzaid'}, {'creatorType': 'author', 'firstName': 'Ghada M.', 'lastName': 'El-Banby'}], 'abstractNote': 'Solar energy from photovoltaic systems (PV) ranks as the third greatest renewable electricity generation resource, expanding quickly through the years as it is free from environmental pollution and has cheap installation costs. Effective performance at high working rates is contingent on the early failure detection of PV modules. This study introduces an innovative deep learning model utilizing a Vision Transformer (ViT) artificial neural network (ANN) for the automatic detection of faults in infrared thermography (IR) images of PV modules. Our approach aims to enhance the accuracy of PV fault detection and classification compared to existing deep learning methods. The proposed framework encompasses three primary stages: (1) image preprocessing, which includes the application of the unsharp mask to sharpen the image’s edges or high-frequency components; (2) data augmentation techniques designed to overcome the problem of unbalanced classes that affect the training process, resulting in learning specific majority classes better than others; and (3) implementing a Vision Transformer deep learning model for its precision in digital image analysis. We evaluated the framework using the public Infrared Solar Modules dataset. The performance was quantitatively assessed using several metrics: accuracy, recall, precision, and F1 score. The dataset is classified into eleven different PV anomalies and another class of no-anomaly PV modules. The results show that our proposed approach has 98.23% accuracy for classifying the dataset into two classes, one for the PV anomaly and the other for the no-anomaly. It also has 96.19% accuracy for classifying eleven PV failures and 95.55% for twelve classes, including the no-anomaly class with the eleven types of anomalies. The experimental results underscore the potential of our model for earlier and more precise detection of PV faults. 
Furthermore, comparative analysis revealed the superior performance of the proposed approach over other deep learning methods.', 'publicationTitle': 'Energy Conversion and Management', 'volume': '314', 'issue': '', 'pages': '118718', 'date': '2024-08-15', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Energy Conversion and Management', 'language': '', 'DOI': '10.1016/j.enconman.2024.118718', 'ISSN': '0196-8904', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0196890424006599', 'accessDate': '2025-04-25T03:50:14Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Artificial Intelligence', 'type': 1}, {'tag': 'Fault Detection System', 'type': 1}, {'tag': 'Photovoltaic (PV) systems', 'type': 1}, {'tag': 'Thermography', 'type': 1}, {'tag': 'Vision Transformer', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-04-25T03:50:14Z', 'dateModified': '2025-04-25T03:50:14Z'}\n",
      "{'key': 'VN9DT7GP', 'version': 4759, 'itemType': 'journalArticle', 'title': 'Fault detection and monitoring systems for photovoltaic installations: A review', 'creators': [{'creatorType': 'author', 'firstName': 'Asma', 'lastName': 'Triki-Lahiani'}, {'creatorType': 'author', 'firstName': 'Afef', 'lastName': 'Bennani-Ben Abdelghani'}, {'creatorType': 'author', 'firstName': 'Ilhem', 'lastName': 'Slama-Belkhodja'}], 'abstractNote': \"As any energy production system, photovoltaic (PV) installations have to be monitored to enhance system performances and to early detect failures for more reliability. There are several photovoltaic monitoring strategies based on the output of the plant and its nature. Monitoring can be performed locally on site or remotely. It measures production, focuses also on verification and follow-up of converter and communication devices' effective operation. Up to now, some faults diagnosis methods for PV components and systems have been developed. However, given the evolution of PV installations, more advanced monitoring techniques are continuously under investigation. In this paper, major photovoltaic system failures are addressed. Then techniques for photovoltaic monitoring proposed in recent literature are overviewed and analyzed to point out their differences, advantages and limits.\", 'publicationTitle': 'Renewable and Sustainable Energy Reviews', 'volume': '82', 'issue': '', 'pages': '2680-2692', 'date': '2018-02-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Renewable and Sustainable Energy Reviews', 'language': '', 'DOI': '10.1016/j.rser.2017.09.101', 'ISSN': '1364-0321', 'shortTitle': 'Fault detection and monitoring systems for photovoltaic installations', 'url': 'https://www.sciencedirect.com/science/article/pii/S1364032117313618', 'accessDate': '2025-04-25T03:37:02Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Diagnosis', 'type': 1}, {'tag': 'Fault detection', 'type': 1}, {'tag': 'PV monitoring', 'type': 1}, {'tag': 'PV systems', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-04-25T03:37:02Z', 'dateModified': '2025-04-25T03:37:02Z'}\n",
      "{'key': 'K56497DC', 'version': 4754, 'itemType': 'journalArticle', 'title': 'Application of Artificial Neural Networks to photovoltaic fault detection and diagnosis: A review', 'creators': [{'creatorType': 'author', 'firstName': 'B.', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'C.', 'lastName': 'Delpha'}, {'creatorType': 'author', 'firstName': 'D.', 'lastName': 'Diallo'}, {'creatorType': 'author', 'firstName': 'A.', 'lastName': 'Migan-Dubois'}], 'abstractNote': 'The rapid development of photovoltaic (PV) technology and the growing number and size of PV power plants require increasingly efficient and intelligent health monitoring strategies to ensure reliable operation and high energy availability. Among the various techniques, Artificial Neural Network (ANN) has exhibited the functional capacity to perform the identification and classification of PV faults. In the present review, a systematic study on the application of ANN and hybridized ANN models for PV fault detection and diagnosis (FDD) is conducted. For each application, the targeted PV faults, the detectable faults, the type and amount of data used, the model configuration and the FDD performance are extracted, and analyzed. The main trends, challenges and prospects for the application of ANN for PV FDD are extracted and presented.', 'publicationTitle': 'Renewable and Sustainable Energy Reviews', 'volume': '138', 'issue': '', 'pages': '110512', 'date': '2021-03-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Renewable and Sustainable Energy Reviews', 'language': '', 'DOI': '10.1016/j.rser.2020.110512', 'ISSN': '1364-0321', 'shortTitle': 'Application of Artificial Neural Networks to photovoltaic fault detection and diagnosis', 'url': 'https://www.sciencedirect.com/science/article/pii/S136403212030798X', 'accessDate': '2025-04-25T03:27:34Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Artificial neural network', 'type': 1}, {'tag': 'Deep learning', 'type': 1}, {'tag': 'Fault classification', 'type': 1}, {'tag': 'Fault detection', 'type': 1}, {'tag': 'Machine learning', 'type': 1}, {'tag': 'Photovoltaic', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-04-25T03:27:35Z', 'dateModified': '2025-04-25T03:27:35Z'}\n",
      "{'key': '2LVILDIX', 'version': 4749, 'itemType': 'journalArticle', 'title': 'MambaSOD: Dual Mamba-driven cross-modal fusion network for RGB-D Salient Object Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Yue', 'lastName': 'Zhan'}, {'creatorType': 'author', 'firstName': 'Zhihong', 'lastName': 'Zeng'}, {'creatorType': 'author', 'firstName': 'Haijun', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Xiaoheng', 'lastName': 'Tan'}, {'creatorType': 'author', 'firstName': 'Yinli', 'lastName': 'Tian'}], 'abstractNote': 'The purpose of RGB-D Salient Object Detection (SOD) is to pinpoint the most visually conspicuous areas within images accurately. Numerous conventional models heavily rely on CNN and overlook the long-range contextual dependencies, subsequent transformer-based models have addressed the issue to some extent but introduce quadratic computational complexity. Moreover, incorporating spatial information from depth maps has been proven effective for this task and the primary challenge is how to effectively fuse the complementary information from RGB and depth. Recent advancements in Mamba, particularly its superior ability to perform long-range modeling within linear efficiency, have motivated our exploration of its potential in the RGB-D SOD task. In this paper, we propose a dual Mamba-driven cross-modal fusion network for RGB-D SOD, named MambaSOD, which effectively leverages Mamba’s long-range dependency modeling capability. Specifically, we employ a dual Mamba-driven feature extractor to process RGB and depth inputs to obtain features with global contextual information. Then, we design a cross-modal fusion Mamba to perform modality-specific feature enhancement and model the inter-modal correlation between the RGB and depth features. To the best of our knowledge, this work is an innovative attempt to explore the potential of the pure Mamba in the RGB-D SOD task, offering a novel perspective. Numerous experiments conducted on seven prevailing datasets demonstrate our method’s superiority over eighteen state-of-the-art RGB-D SOD models. The source code will be released at https://github.com/YueZhan721/MambaSOD.', 'publicationTitle': 'Neurocomputing', 'volume': '631', 'issue': '', 'pages': '129718', 'date': '2025-05-28', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Neurocomputing', 'language': '', 'DOI': '10.1016/j.neucom.2025.129718', 'ISSN': '0925-2312', 'shortTitle': 'MambaSOD', 'url': 'https://www.sciencedirect.com/science/article/pii/S092523122500390X', 'accessDate': '2025-04-22T05:46:31Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Cross-modal Fusion Mamba', 'type': 1}, {'tag': 'Mamba-based backbone', 'type': 1}, {'tag': 'RGB-D salient object detection', 'type': 1}, {'tag': 'State Space Model', 'type': 1}], 'collections': ['F6KRC4VK'], 'relations': {}, 'dateAdded': '2025-04-22T05:46:31Z', 'dateModified': '2025-04-22T05:46:31Z'}\n",
      "{'key': 'LKZKLJZ6', 'version': 5088, 'itemType': 'preprint', 'title': 'RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer', 'creators': [{'creatorType': 'author', 'firstName': 'Wenyu', 'lastName': 'Lv'}, {'creatorType': 'author', 'firstName': 'Yian', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Qinyao', 'lastName': 'Chang'}, {'creatorType': 'author', 'firstName': 'Kui', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Guanzhong', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Liu'}], 'abstractNote': 'In this report, we present RT-DETRv2, an improved Real-Time DEtection TRansformer (RT-DETR). RT-DETRv2 builds upon the previous state-of-the-art real-time detector, RT-DETR, and opens up a set of bag-of-freebies for flexibility and practicality, as well as optimizing the training strategy to achieve enhanced performance. To improve the flexibility, we suggest setting a distinct number of sampling points for features at different scales in the deformable attention to achieve selective multi-scale feature extraction by the decoder. To enhance practicality, we propose an optional discrete sampling operator to replace the grid_sample operator that is specific to RT-DETR compared to YOLOs. This removes the deployment constraints typically associated with DETRs. For the training strategy, we propose dynamic data augmentation and scale-adaptive hyperparameters customization to improve performance without loss of speed. Source code and pre-trained models will be available at https://github.com/lyuwenyu/RT-DETR.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2407.17140', 'place': '', 'date': '2024-07-24', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2407.17140', 'citationKey': '', 'url': 'http://arxiv.org/abs/2407.17140', 'accessDate': '2025-04-21T15:52:19Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'RT-DETRv2', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2407.17140 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-04-21T15:52:19Z', 'dateModified': '2025-04-21T15:52:19Z'}\n",
      "{'key': '9CI88KBT', 'version': 4742, 'itemType': 'preprint', 'title': 'U-Mamba: Enhancing Long-range Dependency for Biomedical Image Segmentation', 'creators': [{'creatorType': 'author', 'firstName': 'Jun', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Feifei', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Bo', 'lastName': 'Wang'}], 'abstractNote': 'Convolutional Neural Networks (CNNs) and Transformers have been the most popular architectures for biomedical image segmentation, but both of them have limited ability to handle long-range dependencies because of inherent locality or computational complexity. To address this challenge, we introduce U-Mamba, a general-purpose network for biomedical image segmentation. Inspired by the State Space Sequence Models (SSMs), a new family of deep sequence models known for their strong capability in handling long sequences, we design a hybrid CNN-SSM block that integrates the local feature extraction power of convolutional layers with the abilities of SSMs for capturing the long-range dependency. Moreover, U-Mamba enjoys a self-configuring mechanism, allowing it to automatically adapt to various datasets without manual intervention. We conduct extensive experiments on four diverse tasks, including the 3D abdominal organ segmentation in CT and MR images, instrument segmentation in endoscopy images, and cell segmentation in microscopy images. The results reveal that U-Mamba outperforms state-of-the-art CNN-based and Transformer-based segmentation networks across all tasks. This opens new avenues for efficient long-range dependency modeling in biomedical image analysis. The code, models, and data are publicly available at https://wanglab.ai/u-mamba.html.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2401.04722', 'place': '', 'date': '2024-01-09', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2401.04722', 'citationKey': '', 'url': 'http://arxiv.org/abs/2401.04722', 'accessDate': '2025-04-21T09:23:19Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'U-Mamba', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2401.04722 [eess]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}, {'tag': 'Electrical Engineering and Systems Science - Image and Video Processing', 'type': 1}], 'collections': ['F6KRC4VK'], 'relations': {}, 'dateAdded': '2025-04-21T09:23:19Z', 'dateModified': '2025-04-21T09:23:19Z'}\n",
      "{'key': '6H8WDUYH', 'version': 4739, 'itemType': 'preprint', 'title': 'Fusion-Mamba for Cross-modality Object Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Wenhao', 'lastName': 'Dong'}, {'creatorType': 'author', 'firstName': 'Haodong', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Shaohui', 'lastName': 'Lin'}, {'creatorType': 'author', 'firstName': 'Xiaoyan', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Yunhang', 'lastName': 'Shen'}, {'creatorType': 'author', 'firstName': 'Xuhui', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Juan', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Guodong', 'lastName': 'Guo'}, {'creatorType': 'author', 'firstName': 'Baochang', 'lastName': 'Zhang'}], 'abstractNote': 'Cross-modality fusing complementary information from different modalities effectively improves object detection performance, making it more useful and robust for a wider range of applications. Existing fusion strategies combine different types of images or merge different backbone features through elaborated neural network modules. However, these methods neglect that modality disparities affect cross-modality fusion performance, as different modalities with different camera focal lengths, placements, and angles are hardly fused. In this paper, we investigate cross-modality fusion by associating cross-modal features in a hidden state space based on an improved Mamba with a gating mechanism. We design a Fusion-Mamba block (FMB) to map cross-modal features into a hidden state space for interaction, thereby reducing disparities between cross-modal features and enhancing the representation consistency of fused features. FMB contains two modules: the State Space Channel Swapping (SSCS) module facilitates shallow feature fusion, and the Dual State Space Fusion (DSSF) enables deep fusion in a hidden state space. Through extensive experiments on public datasets, our proposed approach outperforms the state-of-the-art methods on $m$AP with 5.9% on $M^3FD$ and 4.9% on FLIR-Aligned datasets, demonstrating superior object detection performance. To the best of our knowledge, this is the first work to explore the potential of Mamba for cross-modal fusion and establish a new baseline for cross-modality object detection.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2404.09146', 'place': '', 'date': '2024-04-14', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2404.09146', 'citationKey': '', 'url': 'http://arxiv.org/abs/2404.09146', 'accessDate': '2025-04-20T15:31:55Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2404.09146 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['F6KRC4VK'], 'relations': {}, 'dateAdded': '2025-04-20T15:31:55Z', 'dateModified': '2025-04-20T15:31:56Z'}\n",
      "{'key': 'KMEYJQ9F', 'version': 5041, 'itemType': 'preprint', 'title': 'DEIM: DETR with Improved Matching for Fast Convergence', 'creators': [{'creatorType': 'author', 'firstName': 'Shihua', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Zhichao', 'lastName': 'Lu'}, {'creatorType': 'author', 'firstName': 'Xiaodong', 'lastName': 'Cun'}, {'creatorType': 'author', 'firstName': 'Yongjun', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Xiao', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Xi', 'lastName': 'Shen'}], 'abstractNote': 'We introduce DEIM, an innovative and efficient training framework designed to accelerate convergence in real-time object detection with Transformer-based architectures (DETR). To mitigate the sparse supervision inherent in one-to-one (O2O) matching in DETR models, DEIM employs a Dense O2O matching strategy. This approach increases the number of positive samples per image by incorporating additional targets, using standard data augmentation techniques. While Dense O2O matching speeds up convergence, it also introduces numerous low-quality matches that could affect performance. To address this, we propose the Matchability-Aware Loss (MAL), a novel loss function that optimizes matches across various quality levels, enhancing the effectiveness of Dense O2O. Extensive experiments on the COCO dataset validate the efficacy of DEIM. When integrated with RT-DETR and D-FINE, it consistently boosts performance while reducing training time by 50%. Notably, paired with RT-DETRv2, DEIM achieves 53.2% AP in a single day of training on an NVIDIA 4090 GPU. Additionally, DEIM-trained real-time models outperform leading real-time object detectors, with DEIM-D-FINE-L and DEIM-D-FINE-X achieving 54.7% and 56.5% AP at 124 and 78 FPS on an NVIDIA T4 GPU, respectively, without the need for additional data. We believe DEIM sets a new baseline for advancements in real-time object detection. Our code and pre-trained models are available at https://github.com/ShihuaHuang95/DEIM.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2412.04234', 'place': '', 'date': '2025-03-26', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2412.04234', 'citationKey': '', 'url': 'http://arxiv.org/abs/2412.04234', 'accessDate': '2025-04-15T08:08:18Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'DEIM', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2412.04234 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['AZZL5SNP', '9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-04-15T08:08:18Z', 'dateModified': '2025-04-15T08:08:18Z'}\n",
      "{'key': 'SXCS66RV', 'version': 5028, 'itemType': 'preprint', 'title': 'D-FINE: Redefine Regression Task in DETRs as Fine-grained Distribution Refinement', 'creators': [{'creatorType': 'author', 'firstName': 'Yansong', 'lastName': 'Peng'}, {'creatorType': 'author', 'firstName': 'Hebei', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Peixi', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Yueyi', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Xiaoyan', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'Feng', 'lastName': 'Wu'}], 'abstractNote': 'We introduce D-FINE, a powerful real-time object detector that achieves outstanding localization precision by redefining the bounding box regression task in DETR models. D-FINE comprises two key components: Fine-grained Distribution Refinement (FDR) and Global Optimal Localization Self-Distillation (GO-LSD). FDR transforms the regression process from predicting fixed coordinates to iteratively refining probability distributions, providing a fine-grained intermediate representation that significantly enhances localization accuracy. GO-LSD is a bidirectional optimization strategy that transfers localization knowledge from refined distributions to shallower layers through self-distillation, while also simplifying the residual prediction tasks for deeper layers. Additionally, D-FINE incorporates lightweight optimizations in computationally intensive modules and operations, achieving a better balance between speed and accuracy. Specifically, D-FINE-L / X achieves 54.0% / 55.8% AP on the COCO dataset at 124 / 78 FPS on an NVIDIA T4 GPU. When pretrained on Objects365, D-FINE-L / X attains 57.1% / 59.3% AP, surpassing all existing real-time detectors. Furthermore, our method significantly enhances the performance of a wide range of DETR models by up to 5.3% AP with negligible extra parameters and training costs. Our code and pretrained models: https://github.com/Peterande/D-FINE.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2410.13842', 'place': '', 'date': '2024-10-17', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2410.13842', 'citationKey': '', 'url': 'http://arxiv.org/abs/2410.13842', 'accessDate': '2025-04-15T08:08:15Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'D-FINE', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2410.13842 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['4KEZKRYY'], 'relations': {}, 'dateAdded': '2025-04-15T08:08:15Z', 'dateModified': '2025-04-15T08:08:16Z'}\n",
      "{'key': 'BFGNMI9P', 'version': 4732, 'itemType': 'conferencePaper', 'title': 'Lightweight and Efficient Distributed Photovoltaic Panel Defect Detection Model', 'creators': [{'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Gu'}, {'creatorType': 'author', 'firstName': 'Jianqi', 'lastName': 'Li'}], 'abstractNote': 'In the detection of defects in distributed photovoltaic (PV) panel, it is crucial to balance the high precision required for defect detection with the practical challenges of deploying models on low-resource devices. To address this challenge, this paper proposes the YOLOv8-PV model based on YOLOv8. Firstly, a shared-parameter detection head is designed to accelerate training and better learn defect features. A lightweight and high-performance PCONV is incorporated into the detection head to enhance higher throughput and lower memory access. Additionally, a Context Guided block is introduced to reduce computational complexity. Finally, an improved MLCA attention mechanism is added to enhance detection accuracy. Experimental results on a distributed photovoltaic panel dataset demonstrate that the YOLOv8-PV model achieves a reduction in Params, GFLOPS, and model size by 50.0%, 60.4%, and 46.6%, respectively, compared to the baseline model. Additionally, the mAP50 is improved by 1.8%.', 'date': '2024-09', 'proceedingsTitle': '2024 4th International Conference on Computer Science and Blockchain (CCSB)', 'conferenceName': '2024 4th International Conference on Computer Science and Blockchain (CCSB)', 'place': '', 'publisher': '', 'volume': '', 'pages': '31-35', 'series': '', 'language': '', 'DOI': '10.1109/CCSB63463.2024.10735631', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/10735631/', 'accessDate': '2025-04-15T02:45:15Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Computational modeling', 'type': 1}, {'tag': 'Defect detection', 'type': 1}, {'tag': 'Feature extraction', 'type': 1}, {'tag': 'Head', 'type': 1}, {'tag': 'Lightweight', 'type': 1}, {'tag': 'Object detection', 'type': 1}, {'tag': 'Optimization', 'type': 1}, {'tag': 'Performance evaluation', 'type': 1}, {'tag': 'Photovoltaic panel', 'type': 1}, {'tag': 'Photovoltaic systems', 'type': 1}, {'tag': 'Throughput', 'type': 1}, {'tag': 'Training', 'type': 1}, {'tag': 'YOLOv8', 'type': 1}, {'tag': 'defect detection', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-04-15T02:45:15Z', 'dateModified': '2025-04-15T02:45:16Z'}\n",
      "{'key': 'NX738VK5', 'version': 4725, 'itemType': 'journalArticle', 'title': 'FM-RTDETR: Small Object Detection Algorithm Based on Enhanced Feature Fusion with Mamba', 'creators': [{'creatorType': 'author', 'firstName': 'Yuchuan', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Jiahui', 'lastName': 'Dai'}, {'creatorType': 'author', 'firstName': 'Yong', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yafei', 'lastName': 'Chen'}], 'abstractNote': \"Traditional real-time object detection networks deployed in unmanned aerial vehicles (UAVs) struggle to extract features from small objects in complex backgrounds with occlusions and overlapping objects. To address this challenge, we propose FM-RTDETR, a real-time object detection algorithm optimized for small object detection. We redesign the encoder of RT-DETRv2 by integrating the Feature Aggregation and Diffusion Network (FADN), improving the algorithm's ability to capture contextual information. Subsequently, we introduce the Parallel Atrous Mamba Feature Fusion Module (PAMFFM), which combines shallow and deep semantic information to better capture small object features. Furthermore, we propose the Cross-stage Enhanced Feature Fusion Module (CEFFM), merging features for small objects to provide richer and more detailed information. Finally, we propose STIoU Loss, which incorporates a penalty term to adjust the scaling of the loss function, thereby improving detection granularity for small objects. FM-RTDETR achieves AP<sub>50</sub> scores of 54.0% and 56.3% on the VisDrone2019-DET and AI-TOD datasets. Compared with other state-of-the-art methods, our method shows great potential in small object detection from drones. The code is available at https://github.com/Yyc1999super/FM-RTDETR.\", 'publicationTitle': 'IEEE Signal Processing Letters', 'volume': '', 'issue': '', 'pages': '1-5', 'date': '2025', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/LSP.2025.3553426', 'ISSN': '1558-2361', 'shortTitle': 'FM-RTDETR', 'url': 'https://ieeexplore.ieee.org/document/10935299/', 'accessDate': '2025-04-10T06:42:10Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Accuracy', 'type': 1}, {'tag': 'Artificial intelligence', 'type': 1}, {'tag': 'Convolution', 'type': 1}, {'tag': 'Decoding', 'type': 1}, {'tag': 'Feature extraction', 'type': 1}, {'tag': 'Feature fusion', 'type': 1}, {'tag': 'Object detection', 'type': 1}, {'tag': 'RT-DETRv2', 'type': 1}, {'tag': 'Real-time systems', 'type': 1}, {'tag': 'Signal processing algorithms', 'type': 1}, {'tag': 'Small object detection', 'type': 1}, {'tag': 'Training', 'type': 1}, {'tag': 'Transformers', 'type': 1}, {'tag': 'VisDrone', 'type': 1}], 'collections': ['F6KRC4VK'], 'relations': {}, 'dateAdded': '2025-04-10T06:42:10Z', 'dateModified': '2025-04-10T06:42:10Z'}\n",
      "{'key': 'ALCVR4SU', 'version': 5091, 'itemType': 'conferencePaper', 'title': \"Run, Don't Walk: Chasing Higher FLOPS for Faster Neural Networks\", 'creators': [{'creatorType': 'author', 'firstName': 'Jierun', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Shiu-hong', 'lastName': 'Kao'}, {'creatorType': 'author', 'firstName': 'Hao', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Weipeng', 'lastName': 'Zhuo'}, {'creatorType': 'author', 'firstName': 'Song', 'lastName': 'Wen'}, {'creatorType': 'author', 'firstName': 'Chul-Ho', 'lastName': 'Lee'}, {'creatorType': 'author', 'firstName': 'S.-H. Gary', 'lastName': 'Chan'}], 'abstractNote': 'To design fast neural networks, many works have been focusing on reducing the number of floating-point operations (FLOPs). We observe that such reduction in FLOPs, however, does not necessarily lead to a similar level of reduction in latency. This mainly stems from inefficiently low floating-point operations per second (FLOPS). To achieve faster networks, we revisit popular operators and demonstrate that such low FLOPS is mainly due to frequent memory access of the operators, especially the depthwise convolution. We hence propose a novel partial convolution (PConv) that extracts spatial features more efficiently, by cutting down redundant computation and memory access simultaneously. Building upon our PConv, we further propose FasterNet, a new family of neural networks, which attains substantially higher running speed than others on a wide range of devices, without compromising on accuracy for various vision tasks. For example, on ImageNet1k, our tiny FasterNet-T0 is 2.8×, 3.3×, and 2.4× faster than MobileViT-XXS on GPU, CPU, and ARM processors, respectively, while being 2.9% more accurate. Our large FasterNet-L achieves impressive 83.5% top-1 accuracy, on par with the emerging Swin-B, while having 36% higher inference throughput on GPU, as well as saving 37% compute time on CPU. Code is available at https://github. com/JierunChen/FasterNet.', 'date': '6/2023', 'proceedingsTitle': '2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'conferenceName': '2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'place': 'Vancouver, BC, Canada', 'publisher': 'IEEE', 'volume': '', 'pages': '12021-12031', 'series': '', 'language': 'en', 'DOI': '10.1109/CVPR52729.2023.01157', 'ISBN': '979-8-3503-0129-8', 'shortTitle': \"Run, Don't Walk\", 'url': 'https://ieeexplore.ieee.org/document/10203371/', 'accessDate': '2025-04-08T12:19:12Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'https://doi.org/10.15223/policy-029', 'extra': '', 'tags': [], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-04-08T12:19:12Z', 'dateModified': '2025-04-08T12:19:12Z'}\n",
      "{'key': 'GRMIPQBA', 'version': 4716, 'itemType': 'preprint', 'title': 'No More Strided Convolutions or Pooling: A New CNN Building Block for Low-Resolution Images and Small Objects', 'creators': [{'creatorType': 'author', 'firstName': 'Raja', 'lastName': 'Sunkara'}, {'creatorType': 'author', 'firstName': 'Tie', 'lastName': 'Luo'}], 'abstractNote': 'Convolutional neural networks (CNNs) have made resounding success in many computer vision tasks such as image classification and object detection. However, their performance degrades rapidly on tougher tasks where images are of low resolution or objects are small. In this paper, we point out that this roots in a defective yet common design in existing CNN architectures, namely the use of strided convolution and/or pooling layers, which results in a loss of fine-grained information and learning of less effective feature representations. To this end, we propose a new CNN building block called SPD-Conv in place of each strided convolution layer and each pooling layer (thus eliminates them altogether). SPD-Conv is comprised of a space-to-depth (SPD) layer followed by a non-strided convolution (Conv) layer, and can be applied in most if not all CNN architectures. We explain this new design under two most representative computer vision tasks: object detection and image classification. We then create new CNN architectures by applying SPD-Conv to YOLOv5 and ResNet, and empirically show that our approach significantly outperforms state-of-the-art deep learning models, especially on tougher tasks with low-resolution images and small objects. We have open-sourced our code at https://github.com/LabSAINT/SPD-Conv.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2208.03641', 'place': '', 'date': '2022-08-07', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2208.03641', 'citationKey': '', 'url': 'http://arxiv.org/abs/2208.03641', 'accessDate': '2025-04-08T11:42:10Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'No More Strided Convolutions or Pooling', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2208.03641 [cs]\\nversion: 1', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-04-08T11:42:10Z', 'dateModified': '2025-04-08T11:42:10Z'}\n",
      "{'key': 'CTE7ZUJW', 'version': 5076, 'itemType': 'preprint', 'title': 'MobileMamba: Lightweight Multi-Receptive Visual Mamba Network', 'creators': [{'creatorType': 'author', 'firstName': 'Haoyang', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Jiangning', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Yuxuan', 'lastName': 'Cai'}, {'creatorType': 'author', 'firstName': 'Hongxu', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Xiaobin', 'lastName': 'Hu'}, {'creatorType': 'author', 'firstName': 'Zhenye', 'lastName': 'Gan'}, {'creatorType': 'author', 'firstName': 'Yabiao', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Chengjie', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yunsheng', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Lei', 'lastName': 'Xie'}], 'abstractNote': 'Previous research on lightweight models has primarily focused on CNNs and Transformer-based designs. CNNs, with their local receptive fields, struggle to capture long-range dependencies, while Transformers, despite their global modeling capabilities, are limited by quadratic computational complexity in high-resolution scenarios. Recently, state-space models have gained popularity in the visual domain due to their linear computational complexity. Despite their low FLOPs, current lightweight Mamba-based models exhibit suboptimal throughput. In this work, we propose the MobileMamba framework, which balances efficiency and performance. We design a three-stage network to enhance inference speed significantly. At a fine-grained level, we introduce the Multi-Receptive Field Feature Interaction(MRFFI) module, comprising the Long-Range Wavelet Transform-Enhanced Mamba(WTE-Mamba), Efficient Multi-Kernel Depthwise Convolution(MK-DeConv), and Eliminate Redundant Identity components. This module integrates multi-receptive field information and enhances high-frequency detail extraction. Additionally, we employ training and testing strategies to further improve performance and efficiency. MobileMamba achieves up to 83.6% on Top-1, surpassing existing state-of-the-art methods which is maximum x21 faster than LocalVim on GPU. Extensive experiments on high-resolution downstream tasks demonstrate that MobileMamba surpasses current efficient models, achieving an optimal balance between speed and accuracy.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2411.15941', 'place': '', 'date': '2024-11-24', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2411.15941', 'citationKey': '', 'url': 'http://arxiv.org/abs/2411.15941', 'accessDate': '2025-04-05T14:31:11Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'MobileMamba', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2411.15941 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-04-05T14:31:11Z', 'dateModified': '2025-04-05T14:31:11Z'}\n",
      "{'key': 'R85ZI9TC', 'version': 5074, 'itemType': 'journalArticle', 'title': 'MetaFormer Baselines for Vision', 'creators': [{'creatorType': 'author', 'firstName': 'Weihao', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Chenyang', 'lastName': 'Si'}, {'creatorType': 'author', 'firstName': 'Pan', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Mi', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Yichen', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Jiashi', 'lastName': 'Feng'}, {'creatorType': 'author', 'firstName': 'Shuicheng', 'lastName': 'Yan'}, {'creatorType': 'author', 'firstName': 'Xinchao', 'lastName': 'Wang'}], 'abstractNote': \"MetaFormer, the abstracted architecture of Transformer, has been found to play a significant role in achieving competitive performance. In this paper, we further explore the capacity of MetaFormer, again, without focusing on token mixer design: we introduce several baseline models under MetaFormer using the most basic or common mixers, and summarize our observations as follows: (1) MetaFormer ensures solid lower bound of performance. By merely adopting identity mapping as the token mixer, the MetaFormer model, termed IdentityFormer, achieves >80% accuracy on ImageNet-1K. (2) MetaFormer works well with arbitrary token mixers. When specifying the token mixer as even a random matrix to mix tokens, the resulting model RandFormer yields an accuracy of >81%, outperforming IdentityFormer. Rest assured of MetaFormer's results when new token mixers are adopted. (3) MetaFormer effortlessly offers state-of-the-art results. With just conventional token mixers dated back five years ago, the models instantiated from MetaFormer already beat state of the art. (a) ConvFormer outperforms ConvNeXt. Taking the common depthwise separable convolutions as the token mixer, the model termed ConvFormer, which can be regarded as pure CNNs, outperforms the strong CNN model ConvNeXt. (b) CAFormer sets new record on ImageNet-1K. By simply applying depthwise separable convolutions as token mixer in the bottom stages and vanilla self-attention in the top stages, the resulting model CAFormer sets a new record on ImageNet-1K: it achieves an accuracy of 85.5% at 224x224 resolution, under normal supervised training without external data or distillation. In our expedition to probe MetaFormer, we also find that a new activation, StarReLU, reduces 71% FLOPs of activation compared with GELU yet achieves better performance. We expect StarReLU to find great potential in MetaFormer-like models alongside other neural networks.\", 'publicationTitle': 'IEEE Transactions on Pattern Analysis and Machine Intelligence', 'volume': '46', 'issue': '2', 'pages': '896-912', 'date': '2/2024', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'IEEE Trans. Pattern Anal. Mach. 
Intell.', 'language': '', 'DOI': '10.1109/TPAMI.2023.3329173', 'ISSN': '0162-8828, 2160-9292, 1939-3539', 'shortTitle': '', 'url': 'http://arxiv.org/abs/2210.13452', 'accessDate': '2025-04-03T05:46:10Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2210.13452 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}], 'collections': ['7R96H7YT'], 'relations': {}, 'dateAdded': '2025-04-03T05:46:10Z', 'dateModified': '2025-04-03T05:46:10Z'}\n",
      "{'key': '433NNZ3L', 'version': 5078, 'itemType': 'journalArticle', 'title': 'Omni-Kernel Network for Image Restoration', 'creators': [{'creatorType': 'author', 'firstName': 'Yuning', 'lastName': 'Cui'}, {'creatorType': 'author', 'firstName': 'Wenqi', 'lastName': 'Ren'}, {'creatorType': 'author', 'firstName': 'Alois', 'lastName': 'Knoll'}], 'abstractNote': 'Image restoration aims to reconstruct a high-quality image from a degraded low-quality observation. Recently, Transformer models have achieved promising performance on image restoration tasks due to their powerful ability to model long-range dependencies. However, the quadratically growing complexity with respect to the input size makes them inapplicable to practical applications. In this paper, we develop an efficient convolutional network for image restoration by enhancing multi-scale representation learning. To this end, we propose an omni-kernel module that consists of three branches, i.e., global, large, and local branches, to learn global-to-local feature representations efficiently. Specifically, the global branch achieves a global perceptive field via the dual-domain channel attention and frequency-gated mechanism. Furthermore, to provide multi-grained receptive fields, the large branch is formulated via different shapes of depth-wise convolutions with unusually large kernel sizes. Moreover, we complement local information using a point-wise depth-wise convolution. Finally, the proposed network, dubbed OKNet, is established by inserting the omni-kernel module into the bottleneck position for efficiency. Extensive experiments demonstrate that our network achieves state-of-the-art performance on 11 benchmark datasets for three representative image restoration tasks, including image dehazing, image desnowing, and image defocus deblurring. The code is available at https://github.com/c-yn/OKNet.', 'publicationTitle': 'Proceedings of the AAAI Conference on Artificial Intelligence', 'volume': '38', 'issue': '2', 'pages': '1426-1434', 'date': '2024-03-24', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '10.1609/aaai.v38i2.27907', 'ISSN': '2374-3468', 'shortTitle': '', 'url': 'https://ojs.aaai.org/index.php/AAAI/article/view/27907', 'accessDate': '2025-04-03T05:32:47Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ojs.aaai.org', 'callNumber': '', 'rights': 'Copyright (c) 2024 Association for the Advancement of Artificial Intelligence', 'extra': 'Number: 2', 'tags': [{'tag': 'CV: Representation Learning for Vision', 'type': 1}], 'collections': ['49A7H9UF'], 'relations': {}, 'dateAdded': '2025-04-03T05:32:47Z', 'dateModified': '2025-04-03T05:32:47Z'}\n",
      "{'key': 'ZHV7S6GP', 'version': 4694, 'itemType': 'preprint', 'title': 'Mamba: Linear-Time Sequence Modeling with Selective State Spaces', 'creators': [{'creatorType': 'author', 'firstName': 'Albert', 'lastName': 'Gu'}, {'creatorType': 'author', 'firstName': 'Tri', 'lastName': 'Dao'}], 'abstractNote': 'Foundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformers’ computational inefficiency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token. Second, even though this change prevents the use of efficient convolutions, we design a hardware-aware parallel algorithm in recurrent mode. We integrate these selective SSMs into a simplified end-to-end neural network architecture without attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5× higher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences. As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics. On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2312.00752', 'place': '', 'date': '2024-05-31', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2312.00752', 'citationKey': '', 'url': 'http://arxiv.org/abs/2312.00752', 'accessDate': '2025-04-01T16:09:20Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Mamba', 'language': 'en', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2312.00752 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}], 'collections': ['F6KRC4VK'], 'relations': {}, 'dateAdded': '2025-04-01T16:09:20Z', 'dateModified': '2025-04-01T16:09:20Z'}\n",
      "{'key': 'FU5EPV4Q', 'version': 4689, 'itemType': 'preprint', 'title': 'Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model', 'creators': [{'creatorType': 'author', 'firstName': 'Lianghui', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Bencheng', 'lastName': 'Liao'}, {'creatorType': 'author', 'firstName': 'Qian', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Xinlong', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Wenyu', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Xinggang', 'lastName': 'Wang'}], 'abstractNote': 'Recently the state space models (SSMs) with efficient hardware-aware designs, i.e., the Mamba deep learning model, have shown great potential for long sequence modeling. Meanwhile building efficient and generic vision backbones purely upon SSMs is an appealing direction. However, representing visual data is challenging for SSMs due to the position-sensitivity of visual data and the requirement of global context for visual understanding. In this paper, we show that the reliance on self-attention for visual representation learning is not necessary and propose a new generic vision backbone with bidirectional Mamba blocks (Vim), which marks the image sequences with position embeddings and compresses the visual representation with bidirectional state space models. On ImageNet classification, COCO object detection, and ADE20k semantic segmentation tasks, Vim achieves higher performance compared to well-established vision transformers like DeiT, while also demonstrating significantly improved computation & memory efficiency. For example, Vim is 2.8$\\\\times$ faster than DeiT and saves 86.8% GPU memory when performing batch inference to extract features on images with a resolution of 1248$\\\\times$1248. The results demonstrate that Vim is capable of overcoming the computation & memory constraints on performing Transformer-style understanding for high-resolution images and it has great potential to be the next-generation backbone for vision foundation models. Code is available at https://github.com/hustvl/Vim.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2401.09417', 'place': '', 'date': '2024-11-14', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2401.09417', 'citationKey': '', 'url': 'http://arxiv.org/abs/2401.09417', 'accessDate': '2025-04-01T15:49:37Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Vision Mamba', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2401.09417 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}], 'collections': ['F6KRC4VK'], 'relations': {}, 'dateAdded': '2025-04-01T15:49:37Z', 'dateModified': '2025-04-01T15:49:37Z'}\n",
      "{'key': '6RVSER2X', 'version': 4685, 'itemType': 'preprint', 'title': 'Mamba YOLO: A Simple Baseline for Object Detection with State Space Model', 'creators': [{'creatorType': 'author', 'firstName': 'Zeyu', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Chen', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Huiying', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Xinzhong', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Hongbo', 'lastName': 'Li'}], 'abstractNote': \"Driven by the rapid development of deep learning technology, the YOLO series has set a new benchmark for real-time object detectors. Additionally, transformer-based structures have emerged as the most powerful solution in the field, greatly extending the model's receptive field and achieving significant performance improvements. However, this improvement comes at a cost as the quadratic complexity of the self-attentive mechanism increases the computational burden of the model. To address this problem, we introduce a simple yet effective baseline approach called Mamba YOLO. Our contributions are as follows: 1) We propose that the ODMamba backbone introduce a \\\\textbf{S}tate \\\\textbf{S}pace \\\\textbf{M}odel (\\\\textbf{SSM}) with linear complexity to address the quadratic complexity of self-attention. Unlike the other Transformer-base and SSM-base method, ODMamba is simple to train without pretraining. 2) For real-time requirement, we designed the macro structure of ODMamba, determined the optimal stage ratio and scaling size. 3) We design the RG Block that employs a multi-branch structure to model the channel dimensions, which addresses the possible limitations of SSM in sequence modeling, such as insufficient receptive fields and weak image localization. This design captures localized image dependencies more accurately and significantly. Extensive experiments on the publicly available COCO benchmark dataset show that Mamba YOLO achieves state-of-the-art performance compared to previous methods. Specifically, a tiny version of Mamba YOLO achieves a \\\\textbf{7.5}\\\\% improvement in mAP on a single 4090 GPU with an inference time of \\\\textbf{1.5} ms. The pytorch code is available at: \\\\url{https://github.com/HZAI-ZJNU/Mamba-YOLO}\", 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2406.05835', 'place': '', 'date': '2024-12-14', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2406.05835', 'citationKey': '', 'url': 'http://arxiv.org/abs/2406.05835', 'accessDate': '2025-04-01T14:45:30Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Mamba YOLO', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2406.05835 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['F6KRC4VK'], 'relations': {}, 'dateAdded': '2025-04-01T14:45:30Z', 'dateModified': '2025-04-01T14:45:30Z'}\n",
      "{'key': 'T4AK5XSZ', 'version': 4682, 'itemType': 'preprint', 'title': 'Mamba-UNet: UNet-Like Pure Visual Mamba for Medical Image Segmentation', 'creators': [{'creatorType': 'author', 'firstName': 'Ziyang', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Jian-Qing', 'lastName': 'Zheng'}, {'creatorType': 'author', 'firstName': 'Yichi', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Ge', 'lastName': 'Cui'}, {'creatorType': 'author', 'firstName': 'Lei', 'lastName': 'Li'}], 'abstractNote': \"In recent advancements in medical image analysis, Convolutional Neural Networks (CNN) and Vision Transformers (ViT) have set significant benchmarks. While the former excels in capturing local features through its convolution operations, the latter achieves remarkable global context understanding by leveraging self-attention mechanisms. However, both architectures exhibit limitations in efficiently modeling long-range dependencies within medical images, which is a critical aspect for precise segmentation. Inspired by the Mamba architecture, known for its proficiency in handling long sequences and global contextual information with enhanced computational efficiency as a State Space Model (SSM), we propose Mamba-UNet, a novel architecture that synergizes the U-Net in medical image segmentation with Mamba's capability. Mamba-UNet adopts a pure Visual Mamba (VMamba)-based encoder-decoder structure, infused with skip connections to preserve spatial information across different scales of the network. This design facilitates a comprehensive feature learning process, capturing intricate details and broader semantic contexts within medical images. We introduce a novel integration mechanism within the VMamba blocks to ensure seamless connectivity and information flow between the encoder and decoder paths, enhancing the segmentation performance. We conducted experiments on publicly available ACDC MRI Cardiac segmentation dataset, and Synapse CT Abdomen segmentation dataset. The results show that Mamba-UNet outperforms several types of UNet in medical image segmentation under the same hyper-parameter setting. The source code and baseline implementations are available.\", 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2402.05079', 'place': '', 'date': '2024-03-30', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2402.05079', 'citationKey': '', 'url': 'http://arxiv.org/abs/2402.05079', 'accessDate': '2025-04-01T09:03:21Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Mamba-UNet', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2402.05079 [eess]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Electrical Engineering and Systems Science - Image and Video Processing', 'type': 1}], 'collections': ['F6KRC4VK'], 'relations': {}, 'dateAdded': '2025-04-01T09:03:21Z', 'dateModified': '2025-04-01T09:03:21Z'}\n",
      "{'key': 'LMEWUFJI', 'version': 4675, 'itemType': 'preprint', 'title': 'Pinwheel-shaped Convolution and Scale-based Dynamic Loss for Infrared Small Target Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Jiangnan', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Shuangli', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Jingjun', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Xinyu', 'lastName': 'Su'}, {'creatorType': 'author', 'firstName': 'Nan', 'lastName': 'Hai'}, {'creatorType': 'author', 'firstName': 'Xueli', 'lastName': 'Huang'}], 'abstractNote': \"These recent years have witnessed that convolutional neural network (CNN)-based methods for detecting infrared small targets have achieved outstanding performance. However, these methods typically employ standard convolutions, neglecting to consider the spatial characteristics of the pixel distribution of infrared small targets. Therefore, we propose a novel pinwheel-shaped convolution (PConv) as a replacement for standard convolutions in the lower layers of the backbone network. PConv better aligns with the pixel Gaussian spatial distribution of dim small targets, enhances feature extraction, significantly increases the receptive field, and introduces only a minimal increase in parameters. Additionally, while recent loss functions combine scale and location losses, they do not adequately account for the varying sensitivity of these losses across different target scales, limiting detection performance on dim-small targets. To overcome this, we propose a scale-based dynamic (SD) Loss that dynamically adjusts the influence of scale and location losses based on target size, improving the network's ability to detect targets of varying scales. We construct a new benchmark, SIRST-UAVB, which is the largest and most challenging dataset to date for real-shot single-frame infrared small target detection. Lastly, by integrating PConv and SD Loss into the latest small target detection algorithms, we achieved significant performance improvements on IRSTD-1K and our SIRST-UAVB dataset, validating the effectiveness and generalizability of our approach. Code -- https://github.com/JN-Yang/PConv-SDloss-Data\", 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2412.16986', 'place': '', 'date': '2024-12-22', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2412.16986', 'citationKey': '', 'url': 'http://arxiv.org/abs/2412.16986', 'accessDate': '2025-03-28T14:49:51Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2412.16986 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-03-28T14:49:51Z', 'dateModified': '2025-03-28T14:50:18Z'}\n",
      "{'key': '9LQ27FYG', 'version': 4667, 'itemType': 'preprint', 'title': 'DuAT: Dual-Aggregation Transformer Network for Medical Image Segmentation', 'creators': [{'creatorType': 'author', 'firstName': 'Feilong', 'lastName': 'Tang'}, {'creatorType': 'author', 'firstName': 'Qiming', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Jinfeng', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Xianxu', 'lastName': 'Hou'}, {'creatorType': 'author', 'firstName': 'Jionglong', 'lastName': 'Su'}, {'creatorType': 'author', 'firstName': 'Jingxin', 'lastName': 'Liu'}], 'abstractNote': 'Transformer-based models have been widely demonstrated to be successful in computer vision tasks by modelling long-range dependencies and capturing global representations. However, they are often dominated by features of large patterns leading to the loss of local details (e.g., boundaries and small objects), which are critical in medical image segmentation. To alleviate this problem, we propose a Dual-Aggregation Transformer Network called DuAT, which is characterized by two innovative designs, namely, the Global-to-Local Spatial Aggregation (GLSA) and Selective Boundary Aggregation (SBA) modules. The GLSA has the ability to aggregate and represent both global and local spatial features, which are beneficial for locating large and small objects, respectively. The SBA module is used to aggregate the boundary characteristic from low-level features and semantic information from high-level features for better preserving boundary details and locating the re-calibration objects. Extensive experiments in six benchmark datasets demonstrate that our proposed model outperforms state-of-the-art methods in the segmentation of skin lesion images, and polyps in colonoscopy images. In addition, our approach is more robust than existing methods in various challenging situations such as small object segmentation and ambiguous object boundaries.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2212.11677', 'place': '', 'date': '2022-12-21', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2212.11677', 'citationKey': '', 'url': 'http://arxiv.org/abs/2212.11677', 'accessDate': '2025-03-21T01:18:51Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'DuAT', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2212.11677 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-03-21T01:18:51Z', 'dateModified': '2025-03-21T01:18:51Z'}\n",
      "{'key': 'NWR98SQ5', 'version': 5067, 'itemType': 'preprint', 'title': 'Mamba YOLO: A Simple Baseline for Object Detection with State Space Model', 'creators': [{'creatorType': 'author', 'firstName': 'Zeyu', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Chen', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Huiying', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Xinzhong', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Hongbo', 'lastName': 'Li'}], 'abstractNote': \"Driven by the rapid development of deep learning technology, the YOLO series has set a new benchmark for real-time object detectors. Additionally, transformer-based structures have emerged as the most powerful solution in the field, greatly extending the model's receptive field and achieving significant performance improvements. However, this improvement comes at a cost as the quadratic complexity of the self-attentive mechanism increases the computational burden of the model. To address this problem, we introduce a simple yet effective baseline approach called Mamba YOLO. Our contributions are as follows: 1) We propose that the ODMamba backbone introduce a \\\\textbf{S}tate \\\\textbf{S}pace \\\\textbf{M}odel (\\\\textbf{SSM}) with linear complexity to address the quadratic complexity of self-attention. Unlike the other Transformer-base and SSM-base method, ODMamba is simple to train without pretraining. 2) For real-time requirement, we designed the macro structure of ODMamba, determined the optimal stage ratio and scaling size. 3) We design the RG Block that employs a multi-branch structure to model the channel dimensions, which addresses the possible limitations of SSM in sequence modeling, such as insufficient receptive fields and weak image localization. This design captures localized image dependencies more accurately and significantly. Extensive experiments on the publicly available COCO benchmark dataset show that Mamba YOLO achieves state-of-the-art performance compared to previous methods. Specifically, a tiny version of Mamba YOLO achieves a \\\\textbf{7.5}\\\\% improvement in mAP on a single 4090 GPU with an inference time of \\\\textbf{1.5} ms. The pytorch code is available at: \\\\url{https://github.com/HZAI-ZJNU/Mamba-YOLO}\", 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2406.05835', 'place': '', 'date': '2024-12-14', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2406.05835', 'citationKey': '', 'url': 'http://arxiv.org/abs/2406.05835', 'accessDate': '2025-03-19T03:53:20Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Mamba YOLO', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2406.05835 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['49A7H9UF'], 'relations': {}, 'dateAdded': '2025-03-19T03:53:20Z', 'dateModified': '2025-03-19T03:53:20Z'}\n",
      "{'key': 'UUJUL7CL', 'version': 5046, 'itemType': 'preprint', 'title': 'DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Hao', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Feng', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Shilong', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Lei', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Hang', 'lastName': 'Su'}, {'creatorType': 'author', 'firstName': 'Jun', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Lionel M.', 'lastName': 'Ni'}, {'creatorType': 'author', 'firstName': 'Heung-Yeung', 'lastName': 'Shum'}], 'abstractNote': 'We present DINO (\\\\textbf{D}ETR with \\\\textbf{I}mproved de\\\\textbf{N}oising anch\\\\textbf{O}r boxes), a state-of-the-art end-to-end object detector. % in this paper. DINO improves over previous DETR-like models in performance and efficiency by using a contrastive way for denoising training, a mixed query selection method for anchor initialization, and a look forward twice scheme for box prediction. DINO achieves $49.4$AP in $12$ epochs and $51.3$AP in $24$ epochs on COCO with a ResNet-50 backbone and multi-scale features, yielding a significant improvement of $\\\\textbf{+6.0}$\\\\textbf{AP} and $\\\\textbf{+2.7}$\\\\textbf{AP}, respectively, compared to DN-DETR, the previous best DETR-like model. DINO scales well in both model size and data size. Without bells and whistles, after pre-training on the Objects365 dataset with a SwinL backbone, DINO obtains the best results on both COCO \\\\texttt{val2017} ($\\\\textbf{63.2}$\\\\textbf{AP}) and \\\\texttt{test-dev} (\\\\textbf{$\\\\textbf{63.3}$AP}). Compared to other models on the leaderboard, DINO significantly reduces its model size and pre-training data size while achieving better results. Our code will be available at \\\\url{https://github.com/IDEACVR/DINO}.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2203.03605', 'place': '', 'date': '2022-07-11', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2203.03605', 'citationKey': '', 'url': 'http://arxiv.org/abs/2203.03605', 'accessDate': '2025-03-12T03:49:04Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'DINO', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2203.03605 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['4KEZKRYY'], 'relations': {}, 'dateAdded': '2025-03-12T03:49:04Z', 'dateModified': '2025-03-12T03:49:05Z'}\n",
      "{'key': 'EWBTRN8G', 'version': 4511, 'itemType': 'journalArticle', 'title': 'Optimization and Validation of Wafer Surface Defect Detection Algorithm Based on RT-DETR', 'creators': [{'creatorType': 'author', 'firstName': 'Ao', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Yanwei', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Hongbo', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Rui', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Jianjie', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Jiaying', 'lastName': 'Wang'}], 'abstractNote': 'In response to the issue of poor detection performance on wafer surface defect spots and elongated scratches, an improved RT-DETR method for wafer surface defect detection is proposed. Firstly, a dynamic snake convolutional layer is introduced to detect elongated scratches where conventional convolutional kernels fail to extract features effectively. Secondly, to address the problem of information loss in small targets, an attention-based Transformer encoder module and a feature fusion network based on residual thinking are proposed. Finally, verification is conducted using a wafer test dataset. Experimental results demonstrate that compared to the original RT-DETR method, the model exhibits a 4.1% improvement in detecting small particles and a 5.4% improvement in scratch detection performance. Fully meeting the requirements of intelligent manufacturing and high detection accuracy.', 'publicationTitle': 'IEEE Access', 'volume': '13', 'issue': '', 'pages': '39727-39737', 'date': '2025', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/ACCESS.2025.3543525', 'ISSN': '2169-3536', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/10892113/?arnumber=10892113', 'accessDate': '2025-03-09T14:28:42Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Access', 'tags': [{'tag': 'Accuracy', 'type': 1}, {'tag': 'Computational modeling', 'type': 1}, {'tag': 'Data mining', 'type': 1}, {'tag': 'Defect detection', 'type': 1}, {'tag': 'Defects detection', 'type': 1}, {'tag': 'Feature extraction', 'type': 1}, {'tag': 'Kernel', 'type': 1}, {'tag': 'Semiconductor device modeling', 'type': 1}, {'tag': 'Shape', 'type': 1}, {'tag': 'Standards', 'type': 1}, {'tag': 'Transformers', 'type': 1}, {'tag': 'deep learning', 'type': 1}, {'tag': 'object detection', 'type': 1}], 'collections': ['4BGL6XIQ'], 'relations': {}, 'dateAdded': '2025-03-09T14:28:42Z', 'dateModified': '2025-03-09T14:28:42Z'}\n",
      "{'key': 'Y3SS2BQQ', 'version': 4506, 'itemType': 'journalArticle', 'title': 'Foreign Object Shading Detection in Photovoltaic Modules Based on Transfer Learning', 'creators': [{'creatorType': 'author', 'firstName': 'Bin', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Qingda', 'lastName': 'Kong'}, {'creatorType': 'author', 'firstName': 'Hongyu', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Dongdong', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Hui Hwang', 'lastName': 'Goh'}, {'creatorType': 'author', 'firstName': 'Thomas', 'lastName': 'Wu'}], 'abstractNote': 'As a representative new energy source, solar energy has the advantages of easy access to resources and low pollution. However, due to the uncertainty of the external environment, photovoltaic (PV) modules that collect solar energy are often covered by foreign objects in the environment such as leaves and bird droppings, resulting in a decrease in photoelectric conversion efficiency, power losses, and even the “hot spot” phenomenon, resulting in damage to the modules. Existing methods mostly inspect foreign objects manually, which not only incurs high labor costs but also hinders real-time monitoring. To address these problems, this paper proposes an IDETR deep learning target detection model based on Deformable DETR combined with transfer learning and a convolutional block attention module, which can identify foreign object shading on the surfaces of PV modules in actual operating environments. This study contributes to the optimal operation and maintenance of PV systems. In addition, this paper collects data in the field and constructs a dataset of foreign objects of PV modules. The results show that the advanced model can significantly improve the target detection AP values.', 'publicationTitle': 'Energies', 'volume': '16', 'issue': '7', 'pages': '2996', 'date': '2023/1', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '10.3390/en16072996', 'ISSN': '1996-1073', 'shortTitle': '', 'url': 'https://www.mdpi.com/1996-1073/16/7/2996', 'accessDate': '2025-03-09T14:12:09Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'www.mdpi.com', 'callNumber': '', 'rights': 'http://creativecommons.org/licenses/by/3.0/', 'extra': 'Number: 7\\nPublisher: Multidisciplinary Digital Publishing Institute', 'tags': [{'tag': 'convolutional block attention module', 'type': 1}, {'tag': 'foreign object shading detection', 'type': 1}, {'tag': 'photovoltaic module', 'type': 1}, {'tag': 'transfer learning', 'type': 1}], 'collections': ['4BGL6XIQ'], 'relations': {}, 'dateAdded': '2025-03-09T14:12:09Z', 'dateModified': '2025-03-09T14:12:09Z'}\n",
      "{'key': 'LJVEFFWE', 'version': 4505, 'itemType': 'journalArticle', 'title': 'PD-DETR: towards efficient parallel hybrid matching with transformer for photovoltaic cell defects detection', 'creators': [{'creatorType': 'author', 'firstName': 'Langyue', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Yiquan', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Yubin', 'lastName': 'Yuan'}], 'abstractNote': 'Defect detection for photovoltaic (PV) cell images is a challenging task due to the small size of the defect features and the complexity of the background characteristics. Modern detectors rely mostly on proxy learning objectives for prediction and on manual post-processing components. One-to-one set matching is a critical design for DEtection TRansformer (DETR) in order to provide end-to-end capability, so that does not need a hand-crafted Efficient Non-Maximum Suppression NMS. In order to detect PV cell defects faster and better, a technology called the PV cell Defects DEtection Transformer (PD-DETR) is proposed. To address the issue of slow convergence caused by DETR’s direct translation of image feature mapping into target detection results, we created a hybrid feature module. To achieve a balance between performance and computation, the image features are passed through a scoring network and dilated convolution, respectively, to obtain the foreground fine feature and contour high-frequency feature. The two features are then adaptively intercepted and fused. The capacity of the model to detect small-scale defects under complex background conditions is improved by the addition of high-frequency information. Furthermore, too few positive queries will be assigned to the defect target via one-to-one set matching, which will result in sparse supervision of the encoder and impair the decoder’s ability of attention learning. Consequently, we enhanced the detection effect by combining the original DETR with the one-to-many matching branch. Specifically, two Faster RCNN detection heads were added during training. To maintain the end-to-end benefits of DETR, inference is still performed using the original one-to-one set matching. Our model implements 64.7% AP on the PVEL-AD dataset.', 'publicationTitle': 'Complex & Intelligent Systems', 'volume': '10', 'issue': '6', 'pages': '7421-7434', 'date': '2024-12-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Complex Intell. Syst.', 'language': 'en', 'DOI': '10.1007/s40747-024-01559-0', 'ISSN': '2198-6053', 'shortTitle': 'PD-DETR', 'url': 'https://doi.org/10.1007/s40747-024-01559-0', 'accessDate': '2025-03-07T04:23:02Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'Springer Link', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'DETR', 'type': 1}, {'tag': 'Fine feature', 'type': 1}, {'tag': 'High-frequency feature', 'type': 1}, {'tag': 'One-to-many set matching', 'type': 1}, {'tag': 'PV cell defects', 'type': 1}], 'collections': ['4BGL6XIQ'], 'relations': {}, 'dateAdded': '2025-03-07T04:23:02Z', 'dateModified': '2025-03-09T14:06:32Z'}\n",
      "{'key': 'S3JTSG3J', 'version': 4504, 'itemType': 'journalArticle', 'title': 'LW-PV DETR: lightweight model for photovoltaic panel surface defect detection', 'creators': [{'creatorType': 'author', 'firstName': 'Tao', 'lastName': 'Han'}, {'creatorType': 'author', 'firstName': 'Meiping', 'lastName': 'Bao'}, {'creatorType': 'author', 'firstName': 'Tao', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Rui', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Xi', 'lastName': 'Feng'}, {'creatorType': 'author', 'firstName': 'Yourui', 'lastName': 'Huang'}], 'abstractNote': 'The photovoltaic industry is developing rapidly, and efficiently completing the operation and maintenance of photovoltaic systems has become a research hotspot, with photovoltaic panel defect detection being particularly critical. Due to factors such as the complex background of infrared images of photovoltaic panels taken by drones, the small proportion of defect areas, and equipment limitations, existing models face challenges in detection accuracy and deployment. Aiming at the three typical defects commonly found on the surface of photovoltaic (PV) panels, namely, shading, glass breakage and hot spots, a surface defect detection model (LW-PV DETR) for photovoltaic panels is proposed based on the Real-Time DEtection TRansformer (RT-DETR-R18) object detection model. In the backbone network, a lightweight and efficient attention feature extraction module (Faster-Rep-EMA Block) is designed to enhance the model feature extraction ability. In the Encoder, the lightweight convolution (GSConv) module is introduced to achieve model lightweighting. The feature focusing diffusion pyramid network (FFDPN) is proposed to enhance the model’s feature fusion capability. Simultaneously, to avoid the loss of small object features, a multi-level feature selective fusion (MLFSF) module is designed for feature focusing. For the loss function, Inner-IoU is introduced to improve the localization accuracy of bounding box regression. Experimental results on the public photovoltaic panel infrared image dataset GB_HSP_modified show that, compared to the baseline model, LW-PV DETR improves precision, recall, and mean Average Precision (mAP50, mAP50-95) by 3.9%, 18.6%, 18.5% and 10.9%, respectively, while the model’s parameter count is reduced by 11.83%. Compared to other mainstream object detection models, LW-PV DETR also demonstrates excellent detection performance, providing an important reference for research on intelligent detection of photovoltaic panel surface defects.', 'publicationTitle': 'Engineering Research Express', 'volume': '7', 'issue': '1', 'pages': '015357', 'date': '2025-02', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Eng. Res. Express', 'language': 'en', 'DOI': '10.1088/2631-8695/adb4be', 'ISSN': '2631-8695', 'shortTitle': 'LW-PV DETR', 'url': 'https://dx.doi.org/10.1088/2631-8695/adb4be', 'accessDate': '2025-03-07T07:40:08Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'Institute of Physics', 'callNumber': '', 'rights': '', 'extra': 'Publisher: IOP Publishing', 'tags': [], 'collections': ['4BGL6XIQ'], 'relations': {}, 'dateAdded': '2025-03-07T07:40:08Z', 'dateModified': '2025-03-09T14:06:19Z'}\n",
      "{'key': 'Y3Q298W3', 'version': 4493, 'itemType': 'conferencePaper', 'title': 'PV-DETR: A Multimodal Fault Detection Model of PV Arrays based on Parallel Block Attention', 'creators': [{'creatorType': 'author', 'firstName': 'Wanghu', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Yihua', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Long', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Jing', 'lastName': 'Li'}], 'abstractNote': 'Detection of faults in photovoltaic arrays can reduce power generation losses and extend the equipment’s lifespan. Traditional operation and maintenance of photovoltaic power stations primarily rely on electrical characteristics or infrared images. However, data from a single modality are susceptible to environmental interference, affecting detection accuracy. To address these issues, we propose a model called PV-DETR for fault detection in photovoltaic arrays under complex environmental conditions. This model is an extension of RT-DETRv2, which leverages the Transformer architecture for feature extraction and decoding. The model employs a PResNet50 module instead of the original ResNet50, along with haar wavelet downsampling and a parallel block attention mechanism. The PResNet50 module can reduce dimensionality while minimizing information loss. Haar wavelet downsampling retains the original global information and compresses feature maps effectively, and the parallel block attention mechanism significantly enhances the detection of small infrared targets. Experimental results show that the final PV-DETR model achieves an average accuracy of 89% and an average recall of 85% in fault detection using multimodal data, outperforming existing models, including the original RT-DETRv2.', 'date': '2024-12', 'proceedingsTitle': '2024 IEEE International Conference on Big Data (BigData)', 'conferenceName': '2024 IEEE International Conference on Big Data (BigData)', 'place': '', 'publisher': '', 'volume': '', 'pages': '3609-3615', 'series': '', 'language': '', 'DOI': '10.1109/BigData62323.2024.10826130', 'ISBN': '', 'shortTitle': 'PV-DETR', 'url': 'https://ieeexplore.ieee.org/document/10826130/?arnumber=10826130', 'accessDate': '2025-03-09T14:03:01Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2573-2978', 'tags': [{'tag': 'Arrays', 'type': 1}, {'tag': 'Computational modeling', 'type': 1}, {'tag': 'Data models', 'type': 1}, {'tag': 'Fault detection', 'type': 1}, {'tag': 'Image coding', 'type': 1}, {'tag': 'Maintenance', 'type': 1}, {'tag': 'Photovoltaic systems', 'type': 1}, {'tag': 'RT-DETRv2', 'type': 1}, {'tag': 'Real-time systems', 'type': 1}, {'tag': 'Transformers', 'type': 1}, {'tag': 'Wavelet transforms', 'type': 1}, {'tag': 'fault detection', 'type': 1}, {'tag': 'infrared images', 'type': 1}, {'tag': 'multimodal', 'type': 1}, {'tag': 'photovoltaic array', 'type': 1}], 'collections': ['4BGL6XIQ'], 'relations': {}, 'dateAdded': '2025-03-09T14:03:01Z', 'dateModified': '2025-03-09T14:03:01Z'}\n",
      "{'key': 'SED5DETK', 'version': 4477, 'itemType': 'journalArticle', 'title': 'A novel cost-function for transformerbased YOLO algorithm to detect photovoltaic panel defects', 'creators': [{'creatorType': 'author', 'firstName': 'Hambal', 'lastName': 'Tella'}, {'creatorType': 'author', 'firstName': 'Mohamed', 'lastName': 'Mohandes'}, {'creatorType': 'author', 'firstName': 'B.', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Ali', 'lastName': 'Al-Shaikhi'}, {'creatorType': 'author', 'firstName': 'Shafiqur', 'lastName': 'Rehman'}], 'abstractNote': 'Solar panel defects can lead to substantial efficiency loss and increased maintenance expenses. Conventional defect detection methods are often slow and ineffective. Thisstudy revisits the You Only Look Once (YOLO) algorithm and its variations, assessing their efficacy in identifying defects in thermal images of solar panels. Subsequently, we introduce a novel YOLO algorithm, termed YOLOS-PV, built uponthe transformer-based YOLOS algorithm. The proposed algorithm introduces newloss function weights to prioritize localized objects and visualize the attention mapof each transformer head within the YOLOS algorithm. In the experiments, theYOLOS-PV achieves a mAP@0.5:0.95 score of 0.894, surpassing the efficiency ofother YOLO variants. Code implementation can be found here: tella26/YOLOS-PV (github.com).', 'publicationTitle': 'FME Transactions', 'volume': '52', 'issue': '4', 'pages': '639-646', 'date': '2024', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'FME Transactions', 'language': 'en', 'DOI': '10.5937/fme2404639T', 'ISSN': '1451-2092, 2406-128X', 'shortTitle': '', 'url': 'https://scindeks.ceon.rs/Article.aspx?artid=1451-20922404639T', 'accessDate': '2025-03-09T13:36:53Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'http://creativecommons.org/licenses/BY/4.0', 'extra': '', 'tags': [], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T13:36:53Z', 'dateModified': '2025-03-09T13:36:53Z'}\n",
      "{'key': 'MEWQIKTH', 'version': 4473, 'itemType': 'conferencePaper', 'title': 'AI-Based PV Panels Inspection using an Advanced YOLO Algorithm', 'creators': [{'creatorType': 'author', 'firstName': 'Agus', 'lastName': 'Haeruman'}], 'abstractNote': 'The rapid growth of solar photovoltaic (PV) systems as green energy sources has gained momentum in recent years. However, the anomalies of PV panel defects can reduce its efficiency and minimize energy harvesting from the plant. The manual inspection of PV panel defects throughout the plant is costly and time-consuming. Thus, implementing more intelligent ways to inspect solar panel defects will provide more benefits than traditional ones. This study presents an implementation of a deep learning model to detect solar panel defects using an advanced object detection algorithm called You Look Only Once, version 7 (YOLOv7). YOLO is a popular algorithm in computer vision for classification and localization. The dataset utilized in this study was sourced from ROBOFLOW, consisting of 1660 infrared images showcasing thermal defects in PV panels. The model was constructed to identify a broader range of images with heterogeneity, leveraging the aforementioned dataset. Following validation, the model demonstrates a mean Average Precision (mAP) of 85.9%. With this accuracy, the model is relevant for real-world applications. This assertion is affirmed by testing the model with additional data from separate video-capturing PV panels. The video was recorded using a drone equipped with a thermal camera.', 'date': '2024-08-15', 'proceedingsTitle': '', 'conferenceName': 'Renewable Energy: Generation and Application', 'place': '', 'publisher': '', 'volume': '', 'pages': '230-237', 'series': '', 'language': 'en', 'DOI': '10.21741/9781644903216-30', 'ISBN': '', 'shortTitle': '', 'url': 'https://www.mrforum.com/product/9781644903216-30', 'accessDate': '2025-03-09T13:34:35Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T13:34:35Z', 'dateModified': '2025-03-09T13:34:35Z'}\n",
      "{'key': 'FNHQMRHV', 'version': 4465, 'itemType': 'journalArticle', 'title': 'CCA-YOLO: Channel and Coordinate Aware-Based YOLO for Photovoltaic Cell Defect Detection in Electroluminescence Images', 'creators': [{'creatorType': 'author', 'firstName': 'Junqi', 'lastName': 'Bao'}, {'creatorType': 'author', 'firstName': 'Xiaochen', 'lastName': 'Yuan'}, {'creatorType': 'author', 'firstName': 'Qingying', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Chan-Tong', 'lastName': 'Lam'}, {'creatorType': 'author', 'firstName': 'Wei', 'lastName': 'Ke'}, {'creatorType': 'author', 'firstName': 'Ping', 'lastName': 'Li'}], 'abstractNote': 'Solar energy is a renewable energy used for urban power generation, contributing to sustainable cities. In solar energy generation, it is important to inspect the health of photovoltaic (PV) cells for safety and power transformation efficiency. Defects in PV cells are usually irregular with different scales, challenging automated defect detection for PV cells. Therefore, this article presents a channel and coordinate aware-based YOLO (CCA-YOLO) for efficient PV cell defect detection. Specifically, to provide accurate backbone features from the complex background defect images, the residual coordinate convolution-based ECA (RCC-ECA) enhances the backbone feature representation by learning from channel and coordinate information. To learn the intraclass/interclass variations and interclass similarity and convey coordinate information among different scales, the multiscale defect feature localization module (MDFLM) incorporates a larger backbone feature to improve the robustness of multiscale defects. The RCC-Up/Down optimizes the sampled features to minimize the inaccurate representation of the features caused by the sampling process. In addition, RCC-Up/Down conveys the coordinate information during the up/down sampling process to maintain coordinate awareness, which allows the network to learn from the coordinate information efficiently. Furthermore, the residual feature fusion with coordinate convolution-based CBAM (RFC-CBAM) is introduced to maintain the channel and coordinate awareness for efficient learning from fused features. 
The proposed CCA-YOLO outperforms state-of-the-art (SOTA) methods in PVEL-AD on precision (71.71%), recall (76.91%), F1-Scores (74.19%), mAP50 (98.57%), \\\\text AP_S (26.80%), \\\\text AP_M (64.78%), and \\\\text AP_L (74.93%).', 'publicationTitle': 'IEEE Transactions on Instrumentation and Measurement', 'volume': '74', 'issue': '', 'pages': '1-12', 'date': '2025', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/TIM.2025.3541805', 'ISSN': '1557-9662', 'shortTitle': 'CCA-YOLO', 'url': 'https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber=10884963&ref=', 'accessDate': '2025-03-09T13:31:51Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Transactions on Instrumentation and Measurement', 'tags': [{'tag': 'Accuracy', 'type': 1}, {'tag': 'Computer architecture', 'type': 1}, {'tag': 'Convolutional neural networks', 'type': 1}, {'tag': 'Defect detection', 'type': 1}, {'tag': 'Feature extraction', 'type': 1}, {'tag': 'Industries', 'type': 1}, {'tag': 'Photovoltaic cells', 'type': 1}, {'tag': 'Shape', 'type': 1}, {'tag': 'Transformers', 'type': 1}, {'tag': 'YOLO', 'type': 1}, {'tag': 'defect detection', 'type': 1}, {'tag': 'electroluminescence images', 'type': 1}, {'tag': 'photovoltaic (PV) cell', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T13:31:51Z', 'dateModified': '2025-03-09T13:31:51Z'}\n",
      "{'key': 'RMVBASQN', 'version': 4457, 'itemType': 'journalArticle', 'title': 'A novel object recognition method for photovoltaic (PV) panel occlusion based on deep learning', 'creators': [{'creatorType': 'author', 'firstName': 'Jing', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Rongqiang', 'lastName': 'Guan'}, {'creatorType': 'author', 'firstName': 'Cungui', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Fang', 'lastName': 'Shao'}], 'abstractNote': 'During the long-term operation of the photovoltaic (PV) system, occlusion will reduce the solar radiation energy received by the PV module, as well as the photoelectric conversion efficiency and economy. However, the occlusion detection of the PV power station has the defects of low efficiency, poor accuracy, and untimely detection, which will cause unknown system losses. Based on the deep learning algorithm, this paper conducts research on PV module occlusion detection. In order to accurately obtain the occlusion area and position information of the PV panel, a PV module occlusion detection model based on the Segment-You Only Look Once (Seg-YOLO) algorithm is established. Based on the YOLOv5 algorithm, the loss function is modified, the Segment Head detection module is introduced, and the convolutional block attention module (CBAM) attention mechanism is added to achieve the accurate detection of small targets by the algorithm model and the fast detection of the PV module occlusion area identify. The model performance research is carried out on three types of occlusion datasets: leaf, bird dropping, and shadow. According to the experimental results, the proposed model has better recognition accuracy and speed than SSD, Faster-Rcnn, YOLOv4, and U-Net. The precision rate, recall rate, and recognition speed can reach 90.52%, 92.41%, and 92.3 FPS, respectively. This model can lay a theoretical foundation for the intelligent operation and maintenance of PV systems.', 'publicationTitle': 'Journal of Computational Methods in Sciences and Engineering', 'volume': '23', 'issue': '6', 'pages': '3391-3405', 'date': '2023-11-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '10.3233/JCM-237108', 'ISSN': '1472-7978', 'shortTitle': '', 'url': 'https://journals.sagepub.com/doi/abs/10.3233/JCM-237108', 'accessDate': '2025-03-09T13:30:04Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'SAGE Journals', 'callNumber': '', 'rights': '', 'extra': 'Publisher: SAGE Publications', 'tags': [], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T13:30:04Z', 'dateModified': '2025-03-09T13:30:04Z'}\n",
      "{'key': '9NL7WXAL', 'version': 4454, 'itemType': 'journalArticle', 'title': 'Solar panel defect detection design based on YOLO v5 algorithm', 'creators': [{'creatorType': 'author', 'firstName': 'Jing', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Keyao', 'lastName': 'Zeng'}, {'creatorType': 'author', 'firstName': 'Zijun', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Wanhan', 'lastName': 'Zhong'}], 'abstractNote': '', 'publicationTitle': 'Heliyon', 'volume': '9', 'issue': '8', 'pages': '', 'date': '2023-08-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Heliyon', 'language': 'English', 'DOI': '10.1016/j.heliyon.2023.e18826', 'ISSN': '2405-8440', 'shortTitle': '', 'url': 'https://www.cell.com/heliyon/abstract/S2405-8440(23)06034-6', 'accessDate': '2025-03-09T13:25:27Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'www.cell.com', 'callNumber': '', 'rights': '', 'extra': 'Publisher: Elsevier', 'tags': [{'tag': 'Defect detection', 'type': 1}, {'tag': 'Electrical safety', 'type': 1}, {'tag': 'Solar panels', 'type': 1}, {'tag': 'YOLO v5', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T13:25:27Z', 'dateModified': '2025-03-09T13:25:27Z'}\n",
      "{'key': 'W986MNFK', 'version': 4450, 'itemType': 'conferencePaper', 'title': 'Towards Efficient Solar Panel Inspection: A YOLO-based Method for Hotspot Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Muhammad Irshat', 'lastName': 'Ameerdin'}, {'creatorType': 'author', 'firstName': 'Muhammad Herman', 'lastName': 'Jamaluddin'}, {'creatorType': 'author', 'firstName': 'Ahmad Zaki', 'lastName': 'Shukor'}, {'creatorType': 'author', 'firstName': 'Luqman', 'lastName': 'Al Hakim Kamaruzaman'}, {'creatorType': 'author', 'firstName': 'Syazwani', 'lastName': 'Mohamad'}], 'abstractNote': \"Solar energy that captured by the photovoltaic (PV) cells has gained recognition as an important factor in the global search for sustainable and clean energy sources in recent years. One of the Sustainable Development Goals (SDG) that solar technology directly supports is Affordable and Clean Energy. It can help increase access to clean energy sources by improving the efficiency and dependability of solar panels through minimizing its defects. However, a variety of defects can shorten the lifespan and effectiveness of PV array, which are crucial components of solar energy systems. The study concentrates on detecting hotspots on solar panels, identifiable through thermal imaging technology. This project aims to develop a deep learning-based approach for defect detection of solar panels. The project unfolds with a primary goal, that is designing the integration of a thermal sensor and deep learning to detect and identify defects in PV panels. It follows with crafting a robust algorithm within the deep learning environment for effective defect detection and identification. Next, the algorithm's performance will be evaluated, emphasizing its reliability and accuracy in enhancing defect detection. The process begins with physically examining a solar panel, followed by using a drone-mounted thermal camera to capture thermal images. After obtaining enough data, the images undergo model generation by labelling and annotation process using Roboflow. The model is then tested and trained for defect detection using YOLOv8. Once the desired accuracy is reached, the dataset is formatted. A user-friendly graphical interface is developed for ease of interaction. Then, the system's performance is evaluated using a confusion matrix to gauge the effectiveness of the defect detection approach. The panel's defect will be confirmed with the manual inspection. 
Based on the early result obtained, the model's confidence level that has been acquired is 76%.\", 'date': '2024-05', 'proceedingsTitle': '2024 IEEE 14th Symposium on Computer Applications & Industrial Electronics (ISCAIE)', 'conferenceName': '2024 IEEE 14th Symposium on Computer Applications & Industrial Electronics (ISCAIE)', 'place': '', 'publisher': '', 'volume': '', 'pages': '367-372', 'series': '', 'language': '', 'DOI': '10.1109/ISCAIE61308.2024.10576312', 'ISBN': '', 'shortTitle': 'Towards Efficient Solar Panel Inspection', 'url': 'https://ieeexplore.ieee.org/abstract/document/10576312', 'accessDate': '2025-03-09T13:23:00Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2836-4317', 'tags': [{'tag': 'Accuracy', 'type': 1}, {'tag': 'Deep Learning', 'type': 1}, {'tag': 'Deep learning', 'type': 1}, {'tag': 'Hotspots', 'type': 1}, {'tag': 'Refining', 'type': 1}, {'tag': 'Roboflow', 'type': 1}, {'tag': 'Solar Panel', 'type': 1}, {'tag': 'Solar energy', 'type': 1}, {'tag': 'System performance', 'type': 1}, {'tag': 'Thermal sensors', 'type': 1}, {'tag': 'Training', 'type': 1}, {'tag': 'YOLO', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T13:23:00Z', 'dateModified': '2025-03-09T13:23:00Z'}\n",
      "{'key': 'YVKIVCSB', 'version': 4447, 'itemType': 'journalArticle', 'title': 'ST-YOLO: A defect detection method for photovoltaic modules based on infrared thermal imaging and machine vision technology', 'creators': [{'creatorType': 'author', 'firstName': 'Hanfei', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Baoxi', 'lastName': 'Yuan'}, {'creatorType': 'author', 'firstName': 'Chengyu', 'lastName': 'Hu'}, {'creatorType': 'author', 'firstName': 'Yujie', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Feng', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Chunlan', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yuqian', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Peng', 'lastName': 'Chu'}], 'abstractNote': 'Photovoltaic panels are the core components of photovoltaic power generation systems, and their quality directly affects power generation efficiency and circuit safety. To address the shortcomings of existing photovoltaic defect detection technologies, such as high labor costs, large workloads, high sensor failure rates, low reliability, high false alarm rates, high network demands, and slow detection speeds of traditional algorithms, we propose an algorithm named ST-YOLO specifically for photovoltaic module defect detection. This algorithm is based on YOLOv8s. First, it introduces the C2f-SCconv convolution module, which is based on SCconv convolution. This module reduces the computational burden of model parameters and improves detection speed through lightweight design. Additionally, the Triplet Attention mechanism is incorporated, significantly enhancing detection accuracy without substantially increasing model parameter computations. Experiments on a self-built photovoltaic array infrared defect image dataset show that ST-YOLO, compared to the baseline YOLOv8s, achieves a 15% reduction in model weight, a 2.9% improvement in Precision, and a 1.4% increase in mAP@0.5. Compared to YOLOv7-Tiny and YOLOv5s, ST-YOLO also demonstrates superior detection performance and advantages. This indicates that ST-YOLO has significant application value in photovoltaic defect detection.', 'publicationTitle': 'PLOS ONE', 'volume': '19', 'issue': '12', 'pages': 'e0310742', 'date': '2024年12月12日', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'PLOS ONE', 'language': 'en', 'DOI': '10.1371/journal.pone.0310742', 'ISSN': '1932-6203', 'shortTitle': 'ST-YOLO', 'url': 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0310742', 'accessDate': '2025-03-09T13:17:13Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'PLoS Journals', 'callNumber': '', 'rights': '', 'extra': 'Publisher: Public Library of Science', 'tags': [{'tag': 'Algorithms', 'type': 1}, {'tag': 'Alternative energy', 'type': 1}, {'tag': 'Convolution', 'type': 1}, {'tag': 'Imaging techniques', 'type': 1}, {'tag': 'Machine learning algorithms', 'type': 1}, {'tag': 'Photovoltaic power', 'type': 1}, {'tag': 'Power stations', 'type': 1}, {'tag': 'Sunlight', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T13:17:13Z', 'dateModified': '2025-03-09T13:17:13Z'}\n",
      "{'key': 'XEHPWCYJ', 'version': 4442, 'itemType': 'journalArticle', 'title': 'PA-YOLO-Based Multifault Defect Detection Algorithm for PV Panels', 'creators': [{'creatorType': 'author', 'firstName': 'Wang', 'lastName': 'Yin'}, {'creatorType': 'author', 'firstName': 'Zhao', 'lastName': 'Jingyong'}, {'creatorType': 'author', 'firstName': 'Xie', 'lastName': 'Gang'}, {'creatorType': 'author', 'firstName': 'Zhao', 'lastName': 'Zhicheng'}, {'creatorType': 'author', 'firstName': 'Hu', 'lastName': 'Xiao'}], 'abstractNote': 'In recent years, solar photovoltaic (PV) energy, as a clean energy source, has received widespread attention and experienced rapid growth worldwide. However, the rapid growth of PV power deployment also brings important challenges to the maintenance of PV panels, and in order to solve this problem, this paper proposes an innovative algorithm based on PA-YOLO. First, we propose to use PA-YOLO’s asymptotic feature pyramid network (AFPN) instead of YOLOv7’s backbone network to support direct interactions of nonadjacent layers and avoid large semantic gaps between nonadjacent layers. For the occlusion problem of dense targets in the dataset, we introduce a repulsive loss function, which successfully reduces the occurrence of false detection situations. Finally, we propose a customized convolutional block equipped with an EMA mechanism to enhance the perceptual and expressive capabilities of the model. Experimental results on the dataset show that our proposed model achieves excellent performance with an average accuracy (mAP) of 94.5%, which is 6.8% higher than YOLOv7. In addition, our algorithm also succeeds in drastically reducing the model size from 71.3 MB to 48.4 MB, which well demonstrates the effectiveness of the model.', 'publicationTitle': 'International Journal of Photoenergy', 'volume': '2024', 'issue': '1', 'pages': '6113260', 'date': '2024', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '10.1155/2024/6113260', 'ISSN': '1687-529X', 'shortTitle': '', 'url': 'https://onlinelibrary.wiley.com/doi/abs/10.1155/2024/6113260', 'accessDate': '2025-03-09T13:02:15Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'Wiley Online Library', 'callNumber': '', 'rights': 'Copyright © 2024 Wang Yin et al.', 'extra': '_eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1155/2024/6113260', 'tags': [], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T13:02:15Z', 'dateModified': '2025-03-09T13:02:15Z'}\n",
      "{'key': '4KKNSYNT', 'version': 4432, 'itemType': 'conferencePaper', 'title': 'Infrared Thermography Based Hotspot Detection Of Photovoltaic Module using YOLO', 'creators': [{'creatorType': 'author', 'firstName': 'Tahmid', 'lastName': 'Tajwar'}, {'creatorType': 'author', 'firstName': 'Ovib', 'lastName': 'Hassan Mobin'}, {'creatorType': 'author', 'firstName': 'Fariha Reza', 'lastName': 'Khan'}, {'creatorType': 'author', 'firstName': 'Shara Fatema', 'lastName': 'Hossain'}, {'creatorType': 'author', 'firstName': 'Mohaimenul', 'lastName': 'Islam'}, {'creatorType': 'author', 'firstName': 'Md.', 'lastName': 'Mosaddequr Rahman'}], 'abstractNote': 'Regarding clean energy production high curiosity is gained by Solar Photovoltaic (PV) worldwide. Faults in the PV modules cause significant issues for the PV systems. Detecting faults of PV modules could help to take the necessary measures. This study uses Infrared thermography (IRT) to detect the hotspot of PV modules. The objective is to develop a hotspot detection tool using ‘YOLO: You Only Look once.’ The images are converted into a data set for a classifier to detect the hotspot of PV modules. Then the learner is trained and tested with the dataset. After that, the output validates with the IRT images of PV modules. The outcome of this study is to apply a real-time object detection tool identifying the defect of the PV module. The result shows that with a more diversified data set, the confidence of detecting the hotspot increases.', 'date': '2021-05', 'proceedingsTitle': '2021 IEEE 12th Energy Conversion Congress & Exposition - Asia (ECCE-Asia)', 'conferenceName': '2021 IEEE 12th Energy Conversion Congress & Exposition - Asia (ECCE-Asia)', 'place': '', 'publisher': '', 'volume': '', 'pages': '1542-1547', 'series': '', 'language': '', 'DOI': '10.1109/ECCE-Asia49820.2021.9478998', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/9478998/?arnumber=9478998', 'accessDate': '2025-03-09T12:57:24Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2150-6086', 'tags': [{'tag': 'Asia', 'type': 1}, {'tag': 'Condition monitoring', 'type': 1}, {'tag': 'Detectors', 'type': 1}, {'tag': 'Infrared thermography', 'type': 1}, {'tag': 'Object detection', 'type': 1}, {'tag': 'Photovoltaic systems', 'type': 1}, {'tag': 'Production', 'type': 1}, {'tag': 'Tools', 'type': 1}, {'tag': 'YOLO', 'type': 1}, {'tag': 'hotspot', 'type': 1}, {'tag': 'machine learning', 'type': 1}, {'tag': 'photovoltaic', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T12:57:24Z', 'dateModified': '2025-03-09T12:57:24Z'}\n",
      "{'key': 'CSZ5ZRWY', 'version': 4424, 'itemType': 'journalArticle', 'title': 'Enhanced photovoltaic panel defect detection via adaptive complementary fusion in YOLO-ACF', 'creators': [{'creatorType': 'author', 'firstName': 'Wenwen', 'lastName': 'Pan'}, {'creatorType': 'author', 'firstName': 'Xiaofei', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'Yilun', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yang', 'lastName': 'Cao'}, {'creatorType': 'author', 'firstName': 'Yizheng', 'lastName': 'Lang'}, {'creatorType': 'author', 'firstName': 'Yunsheng', 'lastName': 'Qian'}], 'abstractNote': 'Detecting defects on photovoltaic panels using electroluminescence images can significantly enhance the production quality of these panels. Nonetheless, in the process of defect detection, there often arise instances of missed detections and false alarms due to the close resemblance between embedded defect features and the intricate background information. To tackle this challenge, we propose an Adaptive Complementary Fusion (ACF) module designed to intelligently integrate spatial and channel information. This module is seamlessly integrated into YOLOv5 for detecting defects on photovoltaic panels, aiming primarily to enhance model detection performance, achieve model lightweighting, and accelerate detection speed. In order to validate the efficacy of the proposed module, we conducted experiments using a dataset comprising 4500 electroluminescence images of photovoltaic panels. Compared to the cutting-edge detection capability of YOLOv8, our YOLO-ACF method exhibits enhancements of 5.2, 0.8, and 2.3 percentage points in R, mAP50, and mAP50-95, respectively. In contrast to the lightest and fastest YOLOv5, YOLO-ACF achieves reductions of 12.9%, 12.4%, and 4.2% in parameters, weight, and time, respectively, while simultaneously boosting FPS by 5%. Through qualitative and quantitative comparisons with various alternative methods, we demonstrate that our YOLO-ACF strikes a good balance between detection performance, model complexity, and detection speed for defect detection on photovoltaic panels. Moreover, it demonstrates remarkable versatility across a spectrum of defect types.', 'publicationTitle': 'Scientific Reports', 'volume': '14', 'issue': '1', 'pages': '26425', 'date': '2024-11-02', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Sci Rep', 'language': 'en', 'DOI': '10.1038/s41598-024-75772-9', 'ISSN': '2045-2322', 'shortTitle': '', 'url': 'https://www.nature.com/articles/s41598-024-75772-9', 'accessDate': '2025-03-09T12:55:38Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'www.nature.com', 'callNumber': '', 'rights': '2024 The Author(s)', 'extra': 'Publisher: Nature Publishing Group', 'tags': [{'tag': 'Computer science', 'type': 1}, {'tag': 'Object vision', 'type': 1}, {'tag': 'Photovoltaics', 'type': 1}, {'tag': 'Solar energy and photovoltaic technology', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T12:55:38Z', 'dateModified': '2025-03-09T12:55:38Z'}\n",
      "{'key': 'MIVY4YPK', 'version': 4421, 'itemType': 'conferencePaper', 'title': 'Fault Detection of the Solar Photovoltaic Modules Using YOLO Models', 'creators': [{'creatorType': 'author', 'firstName': 'Parveen', 'lastName': 'Malik'}, {'creatorType': 'author', 'firstName': 'Vatsal', 'lastName': 'Saxena'}, {'creatorType': 'author', 'firstName': 'Shreyansh', 'lastName': 'Raj'}, {'creatorType': 'author', 'firstName': 'Saumy', 'lastName': 'Singh'}, {'creatorType': 'author', 'firstName': 'Sachin', 'lastName': 'Kumar'}, {'creatorType': 'author', 'firstName': 'Ganaraj P.', 'lastName': 'S'}], 'abstractNote': 'The growing adoption of solar panels, driven by climate change concerns, underscores the importance of ensuring the reliability of photovoltaic (PV) modules. However, outdoor PV modules deployment face a range of environmental challenges such as extreme temperatures, chemical exposure, and mechanical stress which can lead to aging, defects, and degradation. This research introduces a novel approach for identifying faults in solar photovoltaic (PV) modules. Leveraging deep learning techniques from the You Only Look Once (YOLO) family, specifically the recent YOLOv8 and YOLOv9 models, this paper aims to enhance the reliability and performance of PV systems by accurately detecting and classifying module defects to a thermal images database containing three photo-voltaic cell defects. By automating the fault detection process through computer vision, this work contributes to the ongoing efforts to optimize solar energy generation and maintenance. Further, YOLOv5, YOLOv6, and YOLOv7 are also trained, validated, and tested. The results showed that the novel technique of the GELAN architecture-based model outperformed all other models trained on the custom dataset of thermal images of solar PV modules, achieving a mean average precision (mAP) of 70.4%.', 'date': '2024-09', 'proceedingsTitle': '2024 IEEE Region 10 Symposium (TENSYMP)', 'conferenceName': '2024 IEEE Region 10 Symposium (TENSYMP)', 'place': '', 'publisher': '', 'volume': '', 'pages': '1-6', 'series': '', 'language': '', 'DOI': '10.1109/TENSYMP61132.2024.10752194', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/10752194', 'accessDate': '2025-03-09T12:52:51Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2642-6102', 'tags': [{'tag': 'Computational modeling', 'type': 1}, {'tag': 'Fault detection', 'type': 1}, {'tag': 'Image databases', 'type': 1}, {'tag': 'Maintenance', 'type': 1}, {'tag': 'Microprocessors', 'type': 1}, {'tag': 'Photo Voltaic', 'type': 1}, {'tag': 'Solar energy', 'type': 1}, {'tag': 'Solar panels', 'type': 1}, {'tag': 'Stress', 'type': 1}, {'tag': 'Temperature distribution', 'type': 1}, {'tag': 'YOLO', 'type': 1}, {'tag': 'generalized efficient layer aggregation network', 'type': 1}, {'tag': 'infrared thermography', 'type': 1}, {'tag': 'mAP', 'type': 1}, {'tag': 'object detection', 'type': 1}, {'tag': 'solar cell', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T12:52:51Z', 'dateModified': '2025-03-09T12:52:51Z'}\n",
      "{'key': 'QXDNMW43', 'version': 4417, 'itemType': 'journalArticle', 'title': 'CEMP-YOLO: An infrared overheat detection model for photovoltaic panels in UAVs', 'creators': [{'creatorType': 'author', 'firstName': 'Yan', 'lastName': 'Hong'}, {'creatorType': 'author', 'firstName': 'Lei', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Jingming', 'lastName': 'Su'}, {'creatorType': 'author', 'firstName': 'Yun', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Shikang', 'lastName': 'Fang'}, {'creatorType': 'author', 'firstName': 'Wen', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Mushi', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Hantao', 'lastName': 'Wang'}], 'abstractNote': 'Aiming at the complex working conditions of actual PV power stations, traditional PV panel detection methods employed by operators still result in some faults and safety risks. Under the framework of the YOLOv10n model, a CEMP-YOLOv10n-based infrared image detection algorithm for photovoltaic power plants is proposed. The improvements in CEMP-YOLOv10n comprise four main components. The ABCG_Block structure was designed, and the C2f structure within the Backbone component was optimized to enhance feature extraction capabilities. The ERepGFPN structure is used in the Neck component to retain semantic information and fuse features between high and low layers. The detector head was optimized with PConv convolution to minimize redundant information. Finally, MECA attention was added before P3, P4, and P5 detection heads to enhance adaptive recognition and accuracy.Experimental validation using infrared UAV imagery of PV panels shows the model’s computational cost decreased to 4.7 GFLOPs, 72.3 % of the original. Parameters and weights decreased by 25.99 % and 24.13 %, respectively, while accuracy and mean average precision (mAP) improved by 8.3% and 2 %, reaching 86.6 % and 87.3 %. Compared to 13 YOLO-series algorithms, including DETR, YOLOv8n, YOLOv9-tiny, and YOLOv11n, the CEMP-YOLOv10n model demonstrates superior accuracy, parameter efficiency, and memory consumption. The CEMP-YOLOv10n model significantly improves defect recognition accuracy, reduces missed detections, and balances lightweight design with detection speed. This lays the foundation for future UAV inspection edge device deployment and smart PV big data platform creation.', 'publicationTitle': 'Digital Signal Processing', 'volume': '161', 'issue': '', 'pages': '105072', 'date': '06/2025', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Digital Signal Processing', 'language': 'en', 'DOI': '10.1016/j.dsp.2025.105072', 'ISSN': '10512004', 'shortTitle': 'CEMP-YOLO', 'url': 'https://linkinghub.elsevier.com/retrieve/pii/S1051200425000946', 'accessDate': '2025-03-09T11:39:57Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T11:39:58Z', 'dateModified': '2025-03-09T11:39:58Z'}\n",
      "{'key': 'I3XUGKIS', 'version': 4409, 'itemType': 'journalArticle', 'title': 'In-Depth Review of YOLOv1 to YOLOv10 Variants for Enhanced Photovoltaic Defect Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Muhammad', 'lastName': 'Hussain'}, {'creatorType': 'author', 'firstName': 'Rahima', 'lastName': 'Khanam'}], 'abstractNote': 'This review presents an investigation into the incremental advancements in the YOLO (You Only Look Once) architecture and its derivatives, with a specific focus on their pivotal contributions to improving quality inspection within the photovoltaic (PV) domain. YOLO’s single-stage approach to object detection has made it a preferred option due to its efficiency. The review unearths key drivers of success in each variant, from path aggregation networks to generalised efficient layer aggregation architectures and programmable gradient information, presented in the latest variant, YOLOv10, released in May 2024. Looking ahead, the review predicts a significant trend in future research, indicating a shift toward refining YOLO variants to tackle a wider array of PV fault scenarios. While current discussions mainly centre on micro-crack detection, there is an acknowledged opportunity for expansion. Researchers are expected to delve deeper into attention mechanisms within the YOLO architecture, recognising their potential to greatly enhance detection capabilities, particularly for subtle and intricate faults.', 'publicationTitle': 'Solar', 'volume': '4', 'issue': '3', 'pages': '351-386', 'date': '2024/9', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '10.3390/solar4030016', 'ISSN': '2673-9941', 'shortTitle': '', 'url': 'https://www.mdpi.com/2673-9941/4/3/16', 'accessDate': '2025-03-09T09:52:23Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'www.mdpi.com', 'callNumber': '', 'rights': 'http://creativecommons.org/licenses/by/3.0/', 'extra': 'Number: 3\\nPublisher: Multidisciplinary Digital Publishing Institute', 'tags': [{'tag': 'YOLO', 'type': 1}, {'tag': 'computer vision', 'type': 1}, {'tag': 'convolutional neural networks', 'type': 1}, {'tag': 'deep learning', 'type': 1}, {'tag': 'object detection', 'type': 1}, {'tag': 'photovoltaic', 'type': 1}, {'tag': 'quality inspection: manufacturing', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T09:52:23Z', 'dateModified': '2025-03-09T09:52:23Z'}\n",
      "{'key': 'DZ3JM9NE', 'version': 4405, 'itemType': 'journalArticle', 'title': 'PV-YOLO: Lightweight YOLO for Photovoltaic Panel Fault Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Wang', 'lastName': 'Yin'}, {'creatorType': 'author', 'firstName': 'Shen', 'lastName': 'Lingxin'}, {'creatorType': 'author', 'firstName': 'Li', 'lastName': 'Maohuan'}, {'creatorType': 'author', 'firstName': 'Sun', 'lastName': 'Qianlai'}, {'creatorType': 'author', 'firstName': 'Li', 'lastName': 'Xiaosong'}], 'abstractNote': 'The rapid development of the photovoltaic industry in recent years has made the efficient and accurate completion of photovoltaic operation and maintenance a major focus in recent studies. The key to photovoltaic operation and maintenance is the accurate multifault identification of photovoltaic panel images collected using drones. In this paper, PV-YOLO is proposed to replace YOLOX’s backbone network, CSPDarknet53, with a transformer-based PVTv2 network to obtain local connections between images and feature maps to extract more edge-detail features of similar faults. The CBAM attention mechanism is added to enhance the effective features and improve the detection accuracy of small objects. The label assignment mechanism is optimized, and the SIoU loss functionis used to improve the uneven distribution of samples and accelerate network convergence. Experiments on the dataset prove that this method is superior to the existing technology, as the highest mAP value is 92.56%. This value is 10.46% higher than that of YOLOX, and the mAP is optimal under the same parameter magnitude,proving the model’s effectiveness.Moreover, mAP is increased by over 10%, especially for small targets. In this paper, we implemented a lightweight design for the model, and proposes four models of different sizes to be-sized models that are suitable for different detection scenarios.', 'publicationTitle': 'IEEE Access', 'volume': '11', 'issue': '', 'pages': '10966-10976', 'date': '2023', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/ACCESS.2023.3240894', 'ISSN': '2169-3536', 'shortTitle': 'PV-YOLO', 'url': 'https://ieeexplore.ieee.org/abstract/document/10032147', 'accessDate': '2025-03-09T09:51:30Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Access', 'tags': [{'tag': 'Deep learning', 'type': 1}, {'tag': 'Feature extraction', 'type': 1}, {'tag': 'Mathematical models', 'type': 1}, {'tag': 'Object detection', 'type': 1}, {'tag': 'Photovoltaic panel failure', 'type': 1}, {'tag': 'Photovoltaic systems', 'type': 1}, {'tag': 'Transformers', 'type': 1}, {'tag': 'YOLOX', 'type': 1}, {'tag': 'lightweight', 'type': 1}, {'tag': 'target detection', 'type': 1}, {'tag': 'transformer', 'type': 1}], 'collections': ['S9UURNRQ'], 'relations': {}, 'dateAdded': '2025-03-09T09:51:30Z', 'dateModified': '2025-03-09T09:51:30Z'}\n",
      "{'key': 'GBVXNHLK', 'version': 4684, 'itemType': 'preprint', 'title': 'VMamba: Visual State Space Model', 'creators': [{'creatorType': 'author', 'firstName': 'Yue', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Yunjie', 'lastName': 'Tian'}, {'creatorType': 'author', 'firstName': 'Yuzhong', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Hongtian', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Lingxi', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Yaowei', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Qixiang', 'lastName': 'Ye'}, {'creatorType': 'author', 'firstName': 'Jianbin', 'lastName': 'Jiao'}, {'creatorType': 'author', 'firstName': 'Yunfan', 'lastName': 'Liu'}], 'abstractNote': \"Designing computationally efficient network architectures remains an ongoing necessity in computer vision. In this paper, we adapt Mamba, a state-space language model, into VMamba, a vision backbone with linear time complexity. At the core of VMamba is a stack of Visual State-Space (VSS) blocks with the 2D Selective Scan (SS2D) module. By traversing along four scanning routes, SS2D bridges the gap between the ordered nature of 1D selective scan and the non-sequential structure of 2D vision data, which facilitates the collection of contextual information from various sources and perspectives. Based on the VSS blocks, we develop a family of VMamba architectures and accelerate them through a succession of architectural and implementation enhancements. Extensive experiments demonstrate VMamba's promising performance across diverse visual perception tasks, highlighting its superior input scaling efficiency compared to existing benchmark models. Source code is available at https://github.com/MzeroMiko/VMamba.\", 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2401.10166', 'place': '', 'date': '2024-12-29', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2401.10166', 'citationKey': '', 'url': 'http://arxiv.org/abs/2401.10166', 'accessDate': '2025-03-07T04:21:16Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'VMamba', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2401.10166 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['F6KRC4VK'], 'relations': {}, 'dateAdded': '2025-03-07T04:21:16Z', 'dateModified': '2025-03-07T04:21:18Z'}\n",
      "{'key': 'QDGT5X3R', 'version': 5051, 'itemType': 'conferencePaper', 'title': 'Efficient Multi-Scale Attention Module with Cross-Spatial Learning', 'creators': [{'creatorType': 'author', 'firstName': 'Daliang', 'lastName': 'Ouyang'}, {'creatorType': 'author', 'firstName': 'Su', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Guozhong', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Mingzhu', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Huaiyong', 'lastName': 'Guo'}, {'creatorType': 'author', 'firstName': 'Jian', 'lastName': 'Zhan'}, {'creatorType': 'author', 'firstName': 'Zhijie', 'lastName': 'Huang'}], 'abstractNote': 'Remarkable effectiveness of the channel or spatial attention mechanisms for producing more discernible feature representation are illustrated in various computer vision tasks. However, modeling the cross-channel relationships with channel dimensionality reduction may bring side effect in extracting deep visual representations. In this paper, a novel efficient multi-scale attention (EMA) module is proposed. Focusing on retaining the information on per channel and decreasing the computational overhead, we reshape the partly channels into the batch dimensions and group the channel dimensions into multiple sub-features which make the spatial semantic features well-distributed inside each feature group. Specifically, apart from encoding the global information to re-calibrate the channel-wise weight in each parallel branch, the output features of the two parallel branches are further aggregated by a cross-dimension interaction for capturing pixel-level pairwise relationship. We conduct extensive ablation studies and experiments on image classification and object detection tasks with popular benchmarks (e.g., CIFAR-100, ImageNet-1k, MS COCO and VisDrone2019) for evaluating its performance.', 'date': '2023-6-4', 'proceedingsTitle': 'ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)', 'conferenceName': '', 'place': '', 'publisher': '', 'volume': '', 'pages': '1-5', 'series': '', 'language': '', 'DOI': '10.1109/ICASSP49357.2023.10096516', 'ISBN': '', 'shortTitle': '', 'url': 'http://arxiv.org/abs/2305.13563', 'accessDate': '2025-03-07T02:21:10Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2305.13563 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['7TU9FL49'], 'relations': {}, 'dateAdded': '2025-03-07T02:21:10Z', 'dateModified': '2025-03-07T02:21:10Z'}\n",
      "{'key': 'V6P9WPAA', 'version': 4380, 'itemType': 'journalArticle', 'title': 'Hybrid Deep Learning Model for Fault Detection and Classification of Grid-Connected Photovoltaic System', 'creators': [{'creatorType': 'author', 'firstName': 'Moath', 'lastName': 'Alrifaey'}, {'creatorType': 'author', 'firstName': 'Wei Hong', 'lastName': 'Lim'}, {'creatorType': 'author', 'firstName': 'Chun Kit', 'lastName': 'Ang'}, {'creatorType': 'author', 'firstName': 'Elango', 'lastName': 'Natarajan'}, {'creatorType': 'author', 'firstName': 'Mahmud Iwan', 'lastName': 'Solihin'}, {'creatorType': 'author', 'firstName': 'Mohd Rizon Mohamed', 'lastName': 'Juhari'}, {'creatorType': 'author', 'firstName': 'Sew Sun', 'lastName': 'Tiang'}], 'abstractNote': 'Effective fault detection and classification play essential roles in reducing the hazards such as electric shocks and fire in photovoltaic (PV) systems. However, the issues of interest in fault detection and classification for PV systems remain an open-ended challenge due to manual and time-consuming processes that require the relevant domain knowledge and experience of fault diagnoses. This paper proposes a hybrid deep-learning (DL) model-based combined architectures as the novel DL approaches to achieve the real-time automatic fault detection and classification of a PV system. This research employed the wavelet packet transform (WPT) as a data preprocessing technique to handle the PV voltage signal collected and feeding them as the inputs for combined DL architectures that consist of the equilibrium optimizer algorithm (EOA) and long short-term memory (LSTM-SAE) approaches. The combined DL architectures are able to extract the fault features automatically from the preprocessed data without requiring any previous knowledge, therefore can override the traditional shortages of manual feature extraction and manual selection of optimal features from the extracted fault features. These desirable features are anticipated to speed up the fault detection and classification capability of the proposed DL model with higher accuracy. In order to determine the performance of the proposed fault model, we carried out a comprehensive evaluation study on a 250-kW grid-connected PV system. In this paper, symmetrical and asymmetrical faults have been studied involving all the phases and ground faults such as single phase to ground, phases to phase, phase to phase to ground, and three-phase to ground. The simulation results validate the efficacy of the proposed model in terms of computation time, accuracy of fault detection, and noise robustness. 
Comprehensive comparisons between the simulation results and previous studies demonstrate the multidisciplinary applications of the present study.', 'publicationTitle': 'IEEE Access', 'volume': '10', 'issue': '', 'pages': '13852-13869', 'date': '2022', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/ACCESS.2022.3140287', 'ISSN': '2169-3536', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/9668848/?arnumber=9668848', 'accessDate': '2025-03-07T02:01:26Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Access', 'tags': [{'tag': 'Deep distributed energy', 'type': 1}, {'tag': 'Fault detection', 'type': 1}, {'tag': 'Feature extraction', 'type': 1}, {'tag': 'Maintenance engineering', 'type': 1}, {'tag': 'Photovoltaic systems', 'type': 1}, {'tag': 'Renewable energy sources', 'type': 1}, {'tag': 'Support vector machines', 'type': 1}, {'tag': 'Wavelet packets', 'type': 1}, {'tag': 'equilibrium optimizer algorithm (EOA)', 'type': 1}, {'tag': 'fault detection and classification', 'type': 1}, {'tag': 'grid-connected photovoltaic systems', 'type': 1}, {'tag': 'optimal feature selection', 'type': 1}, {'tag': 'wavelet packet transform (WPT)', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-03-07T02:01:26Z', 'dateModified': '2025-03-07T02:01:26Z'}\n",
      "{'key': 'RCXCFEG8', 'version': 5056, 'itemType': 'conferencePaper', 'title': 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks', 'creators': [{'creatorType': 'author', 'firstName': 'Mingxing', 'lastName': 'Tan'}, {'creatorType': 'author', 'firstName': 'Quoc', 'lastName': 'Le'}], 'abstractNote': 'Convolutional Neural Networks (ConvNets) are commonly developed at a fixed resource budget, and then scaled up for better accuracy if more resources are given. In this paper, we systematically study model scaling and identify that carefully balancing network depth, width, and resolution can lead to better performance. Based on this observation, we propose a new scaling method that uniformly scales all dimensions of depth/width/resolution using a simple yet highly effective compound coefficient. We demonstrate the effectiveness of this method on MobileNets and ResNet. To go even further, we use neural architecture search to design a new baseline network and scale it up to obtain a family of models, called EfficientNets, which achieve much better accuracy and efficiency than previous ConvNets. In particular, our EfficientNet-B7 achieves stateof-the-art 84.4% top-1 / 97.1% top-5 accuracy on ImageNet, while being 8.4x smaller and 6.1x faster on inference than the best existing ConvNet (Huang et al., 2018). Our EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7%), Flower (98.8%), and 3 other transfer learning datasets, with an order of magnitude fewer parameters.', 'date': '2019-05-24', 'proceedingsTitle': 'Proceedings of the 36th International Conference on Machine Learning', 'conferenceName': 'International Conference on Machine Learning', 'place': '', 'publisher': 'PMLR', 'volume': '', 'pages': '6105-6114', 'series': '', 'language': 'en', 'DOI': '', 'ISBN': '', 'shortTitle': 'EfficientNet', 'url': 'https://proceedings.mlr.press/v97/tan19a.html', 'accessDate': '2025-03-06T18:21:11Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'proceedings.mlr.press', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2640-3498', 'tags': [], 'collections': ['56JUBZGW'], 'relations': {}, 'dateAdded': '2025-03-06T18:21:11Z', 'dateModified': '2025-03-06T18:21:11Z'}\n",
      "{'key': 'MDDK8CSL', 'version': 4371, 'itemType': 'preprint', 'title': 'BAF-Detector: An Efficient CNN-Based Detector for Photovoltaic Cell Defect Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Binyi', 'lastName': 'Su'}, {'creatorType': 'author', 'firstName': 'Haiyong', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Zhong', 'lastName': 'Zhou'}], 'abstractNote': 'The multi-scale defect detection for photovoltaic (PV) cell electroluminescence (EL) images is a challenging task, due to the feature vanishing as network deepens. To address this problem, an attention-based top-down and bottom-up architecture is developed to accomplish multi-scale feature fusion. This architecture, called Bidirectional Attention Feature Pyramid Network (BAFPN), can make all layers of the pyramid share similar semantic features. In BAFPN, cosine similarity is employed to measure the importance of each pixel in the fused features. Furthermore, a novel object detector is proposed, called BAF-Detector, which embeds BAFPN into Region Proposal Network (RPN) in Faster RCNN+FPN. BAFPN improves the robustness of the network to scales, thus the proposed detector achieves a good performance in multi-scale defects detection task. Finally, the experimental results on a large-scale EL dataset including 3629 images, 2129 of which are defective, show that the proposed method achieves 98.70% (F-measure), 88.07% (mAP), and 73.29% (IoU) in terms of multi-scale defects classification and detection results in raw PV cell EL images.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2012.10631', 'place': '', 'date': '2021-03-28', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2012.10631', 'citationKey': '', 'url': 'http://arxiv.org/abs/2012.10631', 'accessDate': '2025-03-06T11:51:00Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'BAF-Detector', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2012.10631 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-03-06T11:51:00Z', 'dateModified': '2025-03-06T11:51:00Z'}\n",
      "{'key': 'JJP42R6K', 'version': 5053, 'itemType': 'journalArticle', 'title': 'EfficientDet: Scalable and Efficient Object Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Mingxing', 'lastName': 'Tan'}, {'creatorType': 'author', 'firstName': 'Ruoming', 'lastName': 'Pang'}, {'creatorType': 'author', 'firstName': 'Quoc V', 'lastName': 'Le'}], 'abstractNote': 'Model efﬁciency has become increasingly important in computer vision. In this paper, we systematically study neural network architecture design choices for object detection and propose several key optimizations to improve efﬁciency. First, we propose a weighted bi-directional feature pyramid network (BiFPN), which allows easy and fast multi-scale feature fusion; Second, we propose a compound scaling method that uniformly scales the resolution, depth, and width for all backbone, feature network, and box/class prediction networks at the same time. Based on these optimizations and EfﬁcientNet backbones, we have developed a new family of object detectors, called EfﬁcientDet, which consistently achieve much better efﬁciency than prior art across a wide spectrum of resource constraints. In particular, with single-model and single-scale, our EfﬁcientDetD7 achieves state-of-the-art 52.2 AP on COCO test-dev with 52M parameters and 325B FLOPs1, being 4x – 9x smaller and using 13x – 42x fewer FLOPs than previous detector. Code is available at https://github.com/google/ automl/tree/master/efficientdet.', 'publicationTitle': '', 'volume': '', 'issue': '', 'pages': '', 'date': '', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '', 'ISSN': '', 'shortTitle': '', 'url': '', 'accessDate': '', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'Zotero', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-03-05T05:49:34Z', 'dateModified': '2025-03-05T05:49:35Z'}\n",
      "{'key': 'P64AG7Y3', 'version': 4343, 'itemType': 'conferencePaper', 'title': 'DETRs Beat YOLOs on Real-time Object Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Yian', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Wenyu', 'lastName': 'Lv'}, {'creatorType': 'author', 'firstName': 'Shangliang', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Jinman', 'lastName': 'Wei'}, {'creatorType': 'author', 'firstName': 'Guanzhong', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Qingqing', 'lastName': 'Dang'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Jie', 'lastName': 'Chen'}], 'abstractNote': 'The YOLO series has become the most popular framework for real-time object detection due to its reasonable trade-off between speed and accuracy. However, we observe that the speed and accuracy of YOLOs are negatively affected by the NMS. Recently, end-to-end Transformer-based detectors (DETRs) have provided an alternative to eliminating NMS. Nevertheless, the high computational cost limits their practicality and hinders them from fully exploiting the advantage of excluding NMS. In this paper, we propose the Real-Time DEtection TRansformer (RT-DETR), the first real-time end-to-end object detector to our best knowledge that addresses the above dilemma. We build RT-DETR in two steps, drawing on the advanced DETR: first we focus on maintaining accuracy while improving speed, followed by maintaining speed while improving accuracy. Specifically, we design an efficient hybrid encoder to expeditiously process multi-scale features by decoupling intra-scale interaction and cross-scale fusion to improve speed. Then, we propose the uncertainty-minimal query selection to provide high-quality initial queries to the decoder, thereby improving accuracy. In addition, RT-DETR supports flexible speed tuning by adjusting the number of decoder layers to adapt to various scenarios without retraining. Our RT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 / 74 FPS on T4 GPU, outperforming previously advanced YOLOs in both speed and accuracy. Furthermore, RT-DETR-R50 outperforms DINO-R50 by 2.2% AP in accuracy and about 21 times in FPS. After pre-training with Objects365, RTDETR-R50 / R101 achieves 55.3% / 56.2% AP. The project page: https://zhao-yian.github.io/RTDETR.', 'date': '2024-6-16', 'proceedingsTitle': '2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'conferenceName': '2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'place': 'Seattle, WA, USA', 'publisher': 'IEEE', 'volume': '', 'pages': '16965-16974', 'series': '', 'language': 'en', 'DOI': '10.1109/CVPR52733.2024.01605', 'ISBN': '9798350353006', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/10657220/', 'accessDate': '2025-03-05T05:32:03Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'https://doi.org/10.15223/policy-029', 'extra': '', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2025-03-05T05:32:03Z', 'dateModified': '2025-03-05T05:32:03Z'}\n",
      "{'key': 'XHMCU8ZC', 'version': 5043, 'itemType': 'conferencePaper', 'title': 'DETRs Beat YOLOs on Real-time Object Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Yian', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Wenyu', 'lastName': 'Lv'}, {'creatorType': 'author', 'firstName': 'Shangliang', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Jinman', 'lastName': 'Wei'}, {'creatorType': 'author', 'firstName': 'Guanzhong', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Qingqing', 'lastName': 'Dang'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Jie', 'lastName': 'Chen'}], 'abstractNote': 'The YOLO series has become the most popular framework for real-time object detection due to its reasonable trade-off between speed and accuracy. However, we observe that the speed and accuracy of YOLOs are negatively affected by the NMS. Recently, end-to-end Transformer-based detectors (DETRs) have provided an alternative to eliminating NMS. Nevertheless, the high computational cost limits their practicality and hinders them from fully exploiting the advantage of excluding NMS. In this paper, we propose the Real-Time DEtection TRansformer (RT-DETR), the first real-time end-to-end object detector to our best knowledge that addresses the above dilemma. We build RT-DETR in two steps, drawing on the advanced DETR: first we focus on maintaining accuracy while improving speed, followed by maintaining speed while improving accuracy. Specifically, we design an efficient hybrid encoder to expeditiously process multi-scale features by decoupling intra-scale interaction and cross-scale fusion to improve speed. Then, we propose the uncertainty-minimal query selection to provide high-quality initial queries to the decoder, thereby improving accuracy. In addition, RT-DETR supports flexible speed tuning by adjusting the number of decoder layers to adapt to various scenarios without retraining. Our RT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 / 74 FPS on T4 GPU, outperforming previously advanced YOLOs in both speed and accuracy. Furthermore, RT-DETR-R50 outperforms DINO-R50 by 2.2% AP in accuracy and about 21 times in FPS. After pre-training with Objects365, RTDETR-R50 / R101 achieves 55.3% / 56.2% AP. The project page: https://zhao-yian.github.io/RTDETR.', 'date': '2024-6-16', 'proceedingsTitle': '2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'conferenceName': '2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'place': 'Seattle, WA, USA', 'publisher': 'IEEE', 'volume': '', 'pages': '16965-16974', 'series': '', 'language': 'en', 'DOI': '10.1109/CVPR52733.2024.01605', 'ISBN': '979-8-3503-5300-6', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/10657220/', 'accessDate': '2025-03-05T05:31:58Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'https://doi.org/10.15223/policy-029', 'extra': '', 'tags': [], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-03-05T05:31:58Z', 'dateModified': '2025-03-05T05:31:58Z'}\n",
      "{'key': 'FRDZYSSY', 'version': 4332, 'itemType': 'preprint', 'title': 'Strip R-CNN: Large Strip Convolution for Remote Sensing Object Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Xinbin', 'lastName': 'Yuan'}, {'creatorType': 'author', 'firstName': 'Zhaohui', 'lastName': 'Zheng'}, {'creatorType': 'author', 'firstName': 'Yuxuan', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Xialei', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Li', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Xiang', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Qibin', 'lastName': 'Hou'}, {'creatorType': 'author', 'firstName': 'Ming-Ming', 'lastName': 'Cheng'}], 'abstractNote': 'While witnessed with rapid development, remote sensing object detection remains challenging for detecting high aspect ratio objects. This paper shows that large strip convolutions are good feature representation learners for remote sensing object detection and can detect objects of various aspect ratios well. Based on large strip convolutions, we build a new network architecture called Strip R-CNN, which is simple, efficient, and powerful. Unlike recent remote sensing object detectors that leverage large-kernel convolutions with square shapes, our Strip R-CNN takes advantage of sequential orthogonal large strip convolutions to capture spatial information. In addition, we enhance the localization capability of remote-sensing object detectors by decoupling the detection heads and equipping the localization head with strip convolutions to better localize the target objects. Extensive experiments on several benchmarks, e.g., DOTA, FAIR1M, HRSC2016, and DIOR, show that our Strip R-CNN can largely improve previous works. Notably, our 30M model achieves 82.75% mAP on DOTA-v1.0, setting a new state-of-the-art record.Code is available at https://github.com/YXB-NKU/Strip-R-CNN.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2501.03775', 'place': '', 'date': '2025-01-10', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2501.03775', 'citationKey': '', 'url': 'http://arxiv.org/abs/2501.03775', 'accessDate': '2025-03-02T15:09:42Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Strip R-CNN', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2501.03775 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-03-02T15:09:42Z', 'dateModified': '2025-03-02T15:10:09Z'}\n",
      "{'key': 'YFEBPNNV', 'version': 4320, 'itemType': 'conferencePaper', 'title': 'Strip Pooling: Rethinking Spatial Pooling for Scene Parsing', 'creators': [{'creatorType': 'author', 'firstName': 'Qibin', 'lastName': 'Hou'}, {'creatorType': 'author', 'firstName': 'Li', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Ming-Ming', 'lastName': 'Cheng'}, {'creatorType': 'author', 'firstName': 'Jiashi', 'lastName': 'Feng'}], 'abstractNote': 'Spatial pooling has been proven highly effective in capturing long-range contextual information for pixel-wise prediction tasks, such as scene parsing. In this paper, beyond conventional spatial pooling that usually has a regular shape of N × N , we rethink the formulation of spatial pooling by introducing a new pooling strategy, called strip pooling, which considers a long but narrow kernel, i.e., 1 × N or N × 1. Based on strip pooling, we further investigate spatial pooling architecture design by 1) introducing a new strip pooling module that enables backbone networks to efﬁciently model long-range dependencies, 2) presenting a novel building block with diverse spatial pooling as a core, and 3) systematically comparing the performance of the proposed strip pooling and conventional spatial pooling techniques. Both novel pooling-based designs are lightweight and can serve as an efﬁcient plugand-play module in existing scene parsing networks. Extensive experiments on popular benchmarks (e.g., ADE20K and Cityscapes) demonstrate that our simple approach establishes new state-of-the-art results. Code is available at https://github.com/Andrew-Qibin/SPNet.', 'date': '6/2020', 'proceedingsTitle': '2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'conferenceName': '2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'place': 'Seattle, WA, USA', 'publisher': 'IEEE', 'volume': '', 'pages': '4002-4011', 'series': '', 'language': 'en', 'DOI': '10.1109/CVPR42600.2020.00406', 'ISBN': '978-1-72817-168-5', 'shortTitle': 'Strip Pooling', 'url': 'https://ieeexplore.ieee.org/document/9157204/', 'accessDate': '2025-03-02T14:42:12Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html', 'extra': '', 'tags': [], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-03-02T14:42:12Z', 'dateModified': '2025-03-02T14:42:12Z'}\n",
      "{'key': 'LI8LGH7Y', 'version': 5040, 'itemType': 'preprint', 'title': 'Deformable DETR: Deformable Transformers for End-to-End Object Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Xizhou', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Weijie', 'lastName': 'Su'}, {'creatorType': 'author', 'firstName': 'Lewei', 'lastName': 'Lu'}, {'creatorType': 'author', 'firstName': 'Bin', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Xiaogang', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Jifeng', 'lastName': 'Dai'}], 'abstractNote': 'DETR has been recently proposed to eliminate the need for many hand-designed components in object detection while demonstrating good performance. However, it suffers from slow convergence and limited feature spatial resolution, due to the limitation of Transformer attention modules in processing image feature maps. To mitigate these issues, we proposed Deformable DETR, whose attention modules only attend to a small set of key sampling points around a reference. Deformable DETR can achieve better performance than DETR (especially on small objects) with 10 times less training epochs. Extensive experiments on the COCO benchmark demonstrate the effectiveness of our approach. Code is released at https://github.com/fundamentalvision/Deformable-DETR.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2010.04159', 'place': '', 'date': '2021-03-18', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2010.04159', 'citationKey': '', 'url': 'http://arxiv.org/abs/2010.04159', 'accessDate': '2025-02-28T02:31:35Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'Deformable DETR', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2010.04159 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['4KEZKRYY'], 'relations': {}, 'dateAdded': '2025-02-28T02:31:35Z', 'dateModified': '2025-02-28T02:31:35Z'}\n",
      "{'key': 'PFMTEF63', 'version': 5094, 'itemType': 'preprint', 'title': 'YOLOv12: Attention-Centric Real-Time Object Detectors', 'creators': [{'creatorType': 'author', 'firstName': 'Yunjie', 'lastName': 'Tian'}, {'creatorType': 'author', 'firstName': 'Qixiang', 'lastName': 'Ye'}, {'creatorType': 'author', 'firstName': 'David', 'lastName': 'Doermann'}], 'abstractNote': 'Enhancing the network architecture of the YOLO framework has been crucial for a long time, but has focused on CNN-based improvements despite the proven superiority of attention mechanisms in modeling capabilities. This is because attention-based models cannot match the speed of CNN-based models. This paper proposes an attention-centric YOLO framework, namely YOLOv12, that matches the speed of previous CNN-based ones while harnessing the performance benefits of attention mechanisms. YOLOv12 surpasses all popular real-time object detectors in accuracy with competitive speed. For example, YOLOv12-N achieves 40.6% mAP with an inference latency of 1.64 ms on a T4 GPU, outperforming advanced YOLOv10-N / YOLOv11-N by 2.1%/1.2% mAP with a comparable speed. This advantage extends to other model scales. YOLOv12 also surpasses end-to-end real-time detectors that improve DETR, such as RT-DETR / RT-DETRv2: YOLOv12-S beats RT-DETR-R18 / RT-DETRv2-R18 while running 42% faster, using only 36% of the computation and 45% of the parameters. More comparisons are shown in Figure 1.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2502.12524', 'place': '', 'date': '2025-02-18', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2502.12524', 'citationKey': '', 'url': 'http://arxiv.org/abs/2502.12524', 'accessDate': '2025-02-25T01:18:53Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'YOLOv12', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2502.12524 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['7TU9FL49'], 'relations': {}, 'dateAdded': '2025-02-25T01:18:54Z', 'dateModified': '2025-02-25T01:18:54Z'}\n",
      "{'key': 'AF3M897I', 'version': 5022, 'itemType': 'preprint', 'title': 'End-to-End Object Detection with Transformers', 'creators': [{'creatorType': 'author', 'firstName': 'Nicolas', 'lastName': 'Carion'}, {'creatorType': 'author', 'firstName': 'Francisco', 'lastName': 'Massa'}, {'creatorType': 'author', 'firstName': 'Gabriel', 'lastName': 'Synnaeve'}, {'creatorType': 'author', 'firstName': 'Nicolas', 'lastName': 'Usunier'}, {'creatorType': 'author', 'firstName': 'Alexander', 'lastName': 'Kirillov'}, {'creatorType': 'author', 'firstName': 'Sergey', 'lastName': 'Zagoruyko'}], 'abstractNote': 'We present a new method that views object detection as a direct set prediction problem. Our approach streamlines the detection pipeline, effectively removing the need for many hand-designed components like a non-maximum suppression procedure or anchor generation that explicitly encode our prior knowledge about the task. The main ingredients of the new framework, called DEtection TRansformer or DETR, are a set-based global loss that forces unique predictions via bipartite matching, and a transformer encoder-decoder architecture. Given a fixed small set of learned object queries, DETR reasons about the relations of the objects and the global image context to directly output the final set of predictions in parallel. The new model is conceptually simple and does not require a specialized library, unlike many other modern detectors. DETR demonstrates accuracy and run-time performance on par with the well-established and highly-optimized Faster RCNN baseline on the challenging COCO object detection dataset. Moreover, DETR can be easily generalized to produce panoptic segmentation in a unified manner. We show that it significantly outperforms competitive baselines. Training code and pretrained models are available at https://github.com/facebookresearch/detr.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2005.12872', 'place': '', 'date': '2020-05-28', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2005.12872', 'citationKey': '', 'url': 'http://arxiv.org/abs/2005.12872', 'accessDate': '2025-02-24T12:53:24Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2005.12872 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2025-02-24T12:53:24Z', 'dateModified': '2025-02-24T12:53:24Z'}\n",
      "{'key': 'VMLK2W68', 'version': 5024, 'itemType': 'preprint', 'title': 'BiFormer: Vision Transformer with Bi-Level Routing Attention', 'creators': [{'creatorType': 'author', 'firstName': 'Lei', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Xinjiang', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Zhanghan', 'lastName': 'Ke'}, {'creatorType': 'author', 'firstName': 'Wayne', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Rynson', 'lastName': 'Lau'}], 'abstractNote': 'As the core building block of vision transformers, attention is a powerful tool to capture long-range dependency. However, such power comes at a cost: it incurs a huge computation burden and heavy memory footprint as pairwise token interaction across all spatial locations is computed. A series of works attempt to alleviate this problem by introducing handcrafted and content-agnostic sparsity into attention, such as restricting the attention operation to be inside local windows, axial stripes, or dilated windows. In contrast to these approaches, we propose a novel dynamic sparse attention via bi-level routing to enable a more flexible allocation of computations with content awareness. Specifically, for a query, irrelevant key-value pairs are first filtered out at a coarse region level, and then fine-grained token-to-token attention is applied in the union of remaining candidate regions (\\\\ie, routed regions). We provide a simple yet effective implementation of the proposed bi-level routing attention, which utilizes the sparsity to save both computation and memory while involving only GPU-friendly dense matrix multiplications. Built with the proposed bi-level routing attention, a new general vision transformer, named BiFormer, is then presented. As BiFormer attends to a small subset of relevant tokens in a \\\\textbf{query adaptive} manner without distraction from other irrelevant ones, it enjoys both good performance and high computational efficiency, especially in dense prediction tasks. Empirical results across several computer vision tasks such as image classification, object detection, and semantic segmentation verify the effectiveness of our design. Code is available at \\\\url{https://github.com/rayleizhu/BiFormer}.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2303.08810', 'place': '', 'date': '2023-03-15', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2303.08810', 'citationKey': '', 'url': 'http://arxiv.org/abs/2303.08810', 'accessDate': '2025-02-24T00:58:32Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'BiFormer', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2303.08810 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-02-24T00:58:33Z', 'dateModified': '2025-02-24T00:58:33Z'}\n",
      "{'key': 'H4LVHVQX', 'version': 5070, 'itemType': 'preprint', 'title': 'Masked Autoencoders Are Scalable Vision Learners', 'creators': [{'creatorType': 'author', 'firstName': 'Kaiming', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Xinlei', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Saining', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Yanghao', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Piotr', 'lastName': 'Dollár'}, {'creatorType': 'author', 'firstName': 'Ross', 'lastName': 'Girshick'}], 'abstractNote': 'This paper shows that masked autoencoders (MAE) are scalable self-supervised learners for computer vision. Our MAE approach is simple: we mask random patches of the input image and reconstruct the missing pixels. It is based on two core designs. First, we develop an asymmetric encoder-decoder architecture, with an encoder that operates only on the visible subset of patches (without mask tokens), along with a lightweight decoder that reconstructs the original image from the latent representation and mask tokens. Second, we find that masking a high proportion of the input image, e.g., 75%, yields a nontrivial and meaningful self-supervisory task. Coupling these two designs enables us to train large models efficiently and effectively: we accelerate training (by 3x or more) and improve accuracy. Our scalable approach allows for learning high-capacity models that generalize well: e.g., a vanilla ViT-Huge model achieves the best accuracy (87.8%) among methods that use only ImageNet-1K data. Transfer performance in downstream tasks outperforms supervised pre-training and shows promising scaling behavior.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2111.06377', 'place': '', 'date': '2021-12-19', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2111.06377', 'citationKey': '', 'url': 'http://arxiv.org/abs/2111.06377', 'accessDate': '2025-02-22T14:27:13Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2111.06377 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-02-22T14:27:13Z', 'dateModified': '2025-02-22T14:27:14Z'}\n",
      "{'key': 'SAADAT7A', 'version': 5072, 'itemType': 'journalArticle', 'title': 'MetaFormer Is Actually What You Need for Vision', 'creators': [{'creatorType': 'author', 'firstName': 'Weihao', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Mi', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Pan', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Chenyang', 'lastName': 'Si'}, {'creatorType': 'author', 'firstName': 'Yichen', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Xinchao', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Jiashi', 'lastName': 'Feng'}, {'creatorType': 'author', 'firstName': 'Shuicheng', 'lastName': 'Yan'}], 'abstractNote': 'Transformers have shown great potential in computer vision tasks. A common belief is their attention-based token mixer module contributes most to their competence. However, recent works show the attention-based module in transformers can be replaced by spatial MLPs and the resulted models still perform quite well. Based on this observation, we hypothesize that the general architecture of the transformers, instead of the specific token mixer module, is more essential to the model’s performance. To verify this, we deliberately replace the attention module in transformers with an embarrassingly simple spatial pooling operator to conduct only basic token mixing. Surprisingly, we observe that the derived model, termed as PoolFormer, achieves competitive performance on multiple computer vi∗Work done during an internship at Sea AI Lab.', 'publicationTitle': '', 'volume': '', 'issue': '', 'pages': '', 'date': '', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '', 'ISSN': '', 'shortTitle': '', 'url': '', 'accessDate': '', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'Zotero', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['9YTZVR7S'], 'relations': {}, 'dateAdded': '2025-02-18T10:11:42Z', 'dateModified': '2025-02-18T10:11:42Z'}\n",
      "{'key': 'AR9TRNU6', 'version': 5085, 'itemType': 'preprint', 'title': 'Rich feature hierarchies for accurate object detection and semantic segmentation', 'creators': [{'creatorType': 'author', 'firstName': 'Ross', 'lastName': 'Girshick'}, {'creatorType': 'author', 'firstName': 'Jeff', 'lastName': 'Donahue'}, {'creatorType': 'author', 'firstName': 'Trevor', 'lastName': 'Darrell'}, {'creatorType': 'author', 'firstName': 'Jitendra', 'lastName': 'Malik'}], 'abstractNote': 'Object detection performance, as measured on the canonical PASCAL VOC dataset, has plateaued in the last few years. The best-performing methods are complex ensemble systems that typically combine multiple low-level image features with high-level context. In this paper, we propose a simple and scalable detection algorithm that improves mean average precision (mAP) by more than 30% relative to the previous best result on VOC 2012---achieving a mAP of 53.3%. Our approach combines two key insights: (1) one can apply high-capacity convolutional neural networks (CNNs) to bottom-up region proposals in order to localize and segment objects and (2) when labeled training data is scarce, supervised pre-training for an auxiliary task, followed by domain-specific fine-tuning, yields a significant performance boost. Since we combine region proposals with CNNs, we call our method R-CNN: Regions with CNN features. We also compare R-CNN to OverFeat, a recently proposed sliding-window detector based on a similar CNN architecture. We find that R-CNN outperforms OverFeat by a large margin on the 200-class ILSVRC2013 detection dataset. Source code for the complete system is available at http://www.cs.berkeley.edu/~rbg/rcnn.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:1311.2524', 'place': '', 'date': '2014-10-22', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.1311.2524', 'citationKey': '', 'url': 'http://arxiv.org/abs/1311.2524', 'accessDate': '2025-02-12T10:46:32Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:1311.2524 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['VM4CAJSG'], 'relations': {}, 'dateAdded': '2025-02-12T10:46:32Z', 'dateModified': '2025-02-12T10:46:32Z'}\n",
      "{'key': 'MSYEEWNK', 'version': 5059, 'itemType': 'preprint', 'title': 'Fast R-CNN', 'creators': [{'creatorType': 'author', 'firstName': 'Ross', 'lastName': 'Girshick'}], 'abstractNote': 'This paper proposes a Fast Region-based Convolutional Network method (Fast R-CNN) for object detection. Fast R-CNN builds on previous work to efficiently classify object proposals using deep convolutional networks. Compared to previous work, Fast R-CNN employs several innovations to improve training and testing speed while also increasing detection accuracy. Fast R-CNN trains the very deep VGG16 network 9x faster than R-CNN, is 213x faster at test-time, and achieves a higher mAP on PASCAL VOC 2012. Compared to SPPnet, Fast R-CNN trains VGG16 3x faster, tests 10x faster, and is more accurate. Fast R-CNN is implemented in Python and C++ (using Caffe) and is available under the open-source MIT License at https://github.com/rbgirshick/fast-rcnn.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:1504.08083', 'place': '', 'date': '2015-09-27', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.1504.08083', 'citationKey': '', 'url': 'http://arxiv.org/abs/1504.08083', 'accessDate': '2025-02-12T10:22:40Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:1504.08083 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['VM4CAJSG'], 'relations': {}, 'dateAdded': '2025-02-12T10:22:40Z', 'dateModified': '2025-02-12T10:22:40Z'}\n",
      "{'key': 'CCBTFP2Y', 'version': 5059, 'itemType': 'journalArticle', 'title': 'Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks', 'creators': [{'creatorType': 'author', 'firstName': 'Shaoqing', 'lastName': 'Ren'}, {'creatorType': 'author', 'firstName': 'Kaiming', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Ross', 'lastName': 'Girshick'}, {'creatorType': 'author', 'firstName': 'Jian', 'lastName': 'Sun'}], 'abstractNote': \"State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet [1] and Fast R-CNN [2] have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network(RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features-using the recently popular terminology of neural networks with 'attention' mechanisms, the RPN component tells the unified network where to look. For the very deep VGG-16 model [3], our detection system has a frame rate of 5 fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007, 2012, and MS COCO datasets with only 300 proposals per image. In ILSVRC and COCO 2015 competitions, Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks. Code has been made publicly available.\", 'publicationTitle': 'IEEE Transactions on Pattern Analysis and Machine Intelligence', 'volume': '39', 'issue': '6', 'pages': '1137-1149', 'date': '2017-06', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/TPAMI.2016.2577031', 'ISSN': '1939-3539', 'shortTitle': 'Faster R-CNN', 'url': 'https://ieeexplore.ieee.org/abstract/document/7485869', 'accessDate': '2025-02-12T10:18:40Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Transactions on Pattern Analysis and Machine Intelligence', 'tags': [{'tag': 'Convolutional codes', 'type': 1}, {'tag': 'Detectors', 'type': 1}, {'tag': 'Feature extraction', 'type': 1}, {'tag': 'Object detection', 'type': 1}, {'tag': 'Proposals', 'type': 1}, {'tag': 'Search problems', 'type': 1}, {'tag': 'Training', 'type': 1}, {'tag': 'convolutional neural network', 'type': 1}, {'tag': 'region proposal', 'type': 1}], 'collections': ['VM4CAJSG'], 'relations': {}, 'dateAdded': '2025-02-12T10:18:40Z', 'dateModified': '2025-02-12T10:18:40Z'}\n",
      "{'key': 'AGBIVQEY', 'version': 5034, 'itemType': 'preprint', 'title': 'Attention Is All You Need', 'creators': [{'creatorType': 'author', 'firstName': 'Ashish', 'lastName': 'Vaswani'}, {'creatorType': 'author', 'firstName': 'Noam', 'lastName': 'Shazeer'}, {'creatorType': 'author', 'firstName': 'Niki', 'lastName': 'Parmar'}, {'creatorType': 'author', 'firstName': 'Jakob', 'lastName': 'Uszkoreit'}, {'creatorType': 'author', 'firstName': 'Llion', 'lastName': 'Jones'}, {'creatorType': 'author', 'firstName': 'Aidan N.', 'lastName': 'Gomez'}, {'creatorType': 'author', 'firstName': 'Lukasz', 'lastName': 'Kaiser'}, {'creatorType': 'author', 'firstName': 'Illia', 'lastName': 'Polosukhin'}], 'abstractNote': 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:1706.03762', 'place': '', 'date': '2023-08-02', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.1706.03762', 'citationKey': '', 'url': 'http://arxiv.org/abs/1706.03762', 'accessDate': '2025-02-04T07:58:22Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': 'en', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:1706.03762 [cs]', 'tags': [{'tag': 'Computer Science - Computation and Language', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}], 'collections': ['VM4CAJSG'], 'relations': {}, 'dateAdded': '2025-02-04T07:58:22Z', 'dateModified': '2025-02-10T15:15:09Z'}\n",
      "{'key': 'YWII2CG3', 'version': 5081, 'itemType': 'conferencePaper', 'title': 'Oriented R-CNN for Object Detection', 'creators': [{'creatorType': 'author', 'firstName': 'Xingxing', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Gong', 'lastName': 'Cheng'}, {'creatorType': 'author', 'firstName': 'Jiabao', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Xiwen', 'lastName': 'Yao'}, {'creatorType': 'author', 'firstName': 'Junwei', 'lastName': 'Han'}], 'abstractNote': '', 'date': '2021', 'proceedingsTitle': '', 'conferenceName': 'Proceedings of the IEEE/CVF International Conference on Computer Vision', 'place': '', 'publisher': '', 'volume': '', 'pages': '3520-3529', 'series': '', 'language': 'en', 'DOI': '', 'ISBN': '', 'shortTitle': '', 'url': 'https://openaccess.thecvf.com/content/ICCV2021/html/Xie_Oriented_R-CNN_for_Object_Detection_ICCV_2021_paper.html', 'accessDate': '2025-02-10T15:09:40Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'openaccess.thecvf.com', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['M6XZJHS6'], 'relations': {}, 'dateAdded': '2025-02-10T15:09:40Z', 'dateModified': '2025-02-10T15:09:40Z'}\n",
      "{'key': 'BE7QF59M', 'version': 5065, 'itemType': 'conferencePaper', 'title': 'ImageNet Classification with Deep Convolutional Neural Networks', 'creators': [{'creatorType': 'author', 'firstName': 'Alex', 'lastName': 'Krizhevsky'}, {'creatorType': 'author', 'firstName': 'Ilya', 'lastName': 'Sutskever'}, {'creatorType': 'author', 'firstName': 'Geoffrey E', 'lastName': 'Hinton'}], 'abstractNote': 'We trained a large, deep convolutional neural network to classify the 1.3 million high-resolution images in the LSVRC-2010 ImageNet training set into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 39.7\\\\% and 18.9\\\\% which is considerably better than the previous state-of-the-art results. The neural network, which has 60 million parameters and 500,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and two globally connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of convolutional nets. To reduce overfitting in the globally connected layers we employed a new regularization method that proved to be very effective.', 'date': '2012', 'proceedingsTitle': 'Advances in Neural Information Processing Systems', 'conferenceName': '', 'place': '', 'publisher': 'Curran Associates, Inc.', 'volume': '25', 'pages': '', 'series': '', 'language': '', 'DOI': '', 'ISBN': '', 'shortTitle': '', 'url': 'https://proceedings.neurips.cc/paper/2012/hash/c399862d3b9d6b76c8436e924a68c45b-Abstract.html', 'accessDate': '2025-02-10T01:46:00Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'Neural Information Processing Systems', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['VM4CAJSG'], 'relations': {}, 'dateAdded': '2025-02-10T01:46:00Z', 'dateModified': '2025-02-10T01:46:00Z'}\n",
      "{'key': '865NJLRE', 'version': 5068, 'itemType': 'preprint', 'title': 'Mask R-CNN', 'creators': [{'creatorType': 'author', 'firstName': 'Kaiming', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Georgia', 'lastName': 'Gkioxari'}, {'creatorType': 'author', 'firstName': 'Piotr', 'lastName': 'Dollár'}, {'creatorType': 'author', 'firstName': 'Ross', 'lastName': 'Girshick'}], 'abstractNote': 'We present a conceptually simple, flexible, and general framework for object instance segmentation. Our approach efficiently detects objects in an image while simultaneously generating a high-quality segmentation mask for each instance. The method, called Mask R-CNN, extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. Mask R-CNN is simple to train and adds only a small overhead to Faster R-CNN, running at 5 fps. Moreover, Mask R-CNN is easy to generalize to other tasks, e.g., allowing us to estimate human poses in the same framework. We show top results in all three tracks of the COCO suite of challenges, including instance segmentation, bounding-box object detection, and person keypoint detection. Without bells and whistles, Mask R-CNN outperforms all existing, single-model entries on every task, including the COCO 2016 challenge winners. We hope our simple and effective approach will serve as a solid baseline and help ease future research in instance-level recognition. Code has been made available at: https://github.com/facebookresearch/Detectron', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:1703.06870', 'place': '', 'date': '2018-01-24', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.1703.06870', 'citationKey': '', 'url': 'http://arxiv.org/abs/1703.06870', 'accessDate': '2025-02-10T01:30:57Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:1703.06870 [cs]', 'tags': [{'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}], 'collections': ['VM4CAJSG'], 'relations': {}, 'dateAdded': '2025-02-10T01:30:57Z', 'dateModified': '2025-02-10T01:30:57Z'}\n",
      "{'key': 'ISD9MU3G', 'version': 5026, 'itemType': 'preprint', 'title': 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale', 'creators': [{'creatorType': 'author', 'firstName': 'Alexey', 'lastName': 'Dosovitskiy'}, {'creatorType': 'author', 'firstName': 'Lucas', 'lastName': 'Beyer'}, {'creatorType': 'author', 'firstName': 'Alexander', 'lastName': 'Kolesnikov'}, {'creatorType': 'author', 'firstName': 'Dirk', 'lastName': 'Weissenborn'}, {'creatorType': 'author', 'firstName': 'Xiaohua', 'lastName': 'Zhai'}, {'creatorType': 'author', 'firstName': 'Thomas', 'lastName': 'Unterthiner'}, {'creatorType': 'author', 'firstName': 'Mostafa', 'lastName': 'Dehghani'}, {'creatorType': 'author', 'firstName': 'Matthias', 'lastName': 'Minderer'}, {'creatorType': 'author', 'firstName': 'Georg', 'lastName': 'Heigold'}, {'creatorType': 'author', 'firstName': 'Sylvain', 'lastName': 'Gelly'}, {'creatorType': 'author', 'firstName': 'Jakob', 'lastName': 'Uszkoreit'}, {'creatorType': 'author', 'firstName': 'Neil', 'lastName': 'Houlsby'}], 'abstractNote': 'While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2010.11929', 'place': '', 'date': '2021-06-03', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2010.11929', 'citationKey': '', 'url': 'http://arxiv.org/abs/2010.11929', 'accessDate': '2025-02-04T07:57:39Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'An Image is Worth 16x16 Words', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2010.11929 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computer Vision and Pattern Recognition', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}], 'collections': ['VM4CAJSG'], 'relations': {}, 'dateAdded': '2025-02-04T07:57:39Z', 'dateModified': '2025-02-04T07:57:39Z'}\n",
      "{'key': 'PZ9Y7RSL', 'version': 5093, 'itemType': 'conferencePaper', 'title': 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows', 'creators': [{'creatorType': 'author', 'firstName': 'Ze', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Yutong', 'lastName': 'Lin'}, {'creatorType': 'author', 'firstName': 'Yue', 'lastName': 'Cao'}, {'creatorType': 'author', 'firstName': 'Han', 'lastName': 'Hu'}, {'creatorType': 'author', 'firstName': 'Yixuan', 'lastName': 'Wei'}, {'creatorType': 'author', 'firstName': 'Zheng', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Stephen', 'lastName': 'Lin'}, {'creatorType': 'author', 'firstName': 'Baining', 'lastName': 'Guo'}], 'abstractNote': 'This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains, such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. To address these differences, we propose a hierarchical Transformer whose representation is computed with Shifted windows. The shifted windowing scheme brings greater efﬁciency by limiting self-attention computation to non-overlapping local windows while also allowing for cross-window connection. This hierarchical architecture has the ﬂexibility to model at various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it compatible with a broad range of vision tasks, including image classiﬁcation (87.3 top-1 accuracy on ImageNet-1K) and dense prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO testdev) and semantic segmentation (53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-theart by a large margin of +2.7 box AP and +2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. The hierarchical design and the shifted window approach also prove beneﬁcial for all-MLP architectures. The code and models are publicly available at https://github. com/microsoft/Swin-Transformer.', 'date': '10/2021', 'proceedingsTitle': '2021 IEEE/CVF International Conference on Computer Vision (ICCV)', 'conferenceName': '2021 IEEE/CVF International Conference on Computer Vision (ICCV)', 'place': 'Montreal, QC, Canada', 'publisher': 'IEEE', 'volume': '', 'pages': '9992-10002', 'series': '', 'language': 'en', 'DOI': '10.1109/ICCV48922.2021.00986', 'ISBN': '978-1-6654-2812-5', 'shortTitle': 'Swin Transformer', 'url': 'https://ieeexplore.ieee.org/document/9710580/', 'accessDate': '2025-02-04T07:56:10Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'https://doi.org/10.15223/policy-029', 'extra': '', 'tags': [], 'collections': ['VM4CAJSG'], 'relations': {}, 'dateAdded': '2025-02-04T07:56:10Z', 'dateModified': '2025-02-04T07:56:10Z'}\n",
      "{'key': 'XD4FMS37', 'version': 5037, 'itemType': 'preprint', 'title': 'DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning', 'creators': [{'creatorType': 'author', 'firstName': '', 'lastName': 'DeepSeek-AI'}, {'creatorType': 'author', 'firstName': 'Daya', 'lastName': 'Guo'}, {'creatorType': 'author', 'firstName': 'Dejian', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Haowei', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Junxiao', 'lastName': 'Song'}, {'creatorType': 'author', 'firstName': 'Ruoyu', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Runxin', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Qihao', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Shirong', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Peiyi', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Xiao', 'lastName': 'Bi'}, {'creatorType': 'author', 'firstName': 'Xiaokang', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Xingkai', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Yu', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Z. F.', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Zhibin', 'lastName': 'Gou'}, {'creatorType': 'author', 'firstName': 'Zhihong', 'lastName': 'Shao'}, {'creatorType': 'author', 'firstName': 'Zhuoshu', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Ziyi', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Aixin', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Bing', 'lastName': 'Xue'}, {'creatorType': 'author', 'firstName': 'Bingxuan', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Bochao', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Bei', 'lastName': 'Feng'}, {'creatorType': 'author', 'firstName': 'Chengda', 'lastName': 'Lu'}, {'creatorType': 'author', 'firstName': 'Chenggang', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Chengqi', 'lastName': 'Deng'}, {'creatorType': 'author', 'firstName': 'Chenyu', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Chong', 'lastName': 'Ruan'}, {'creatorType': 'author', 'firstName': 'Damai', 'lastName': 'Dai'}, {'creatorType': 'author', 'firstName': 'Deli', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Dongjie', 'lastName': 'Ji'}, {'creatorType': 'author', 'firstName': 'Erhang', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Fangyun', 'lastName': 'Lin'}, {'creatorType': 'author', 'firstName': 'Fucong', 'lastName': 'Dai'}, {'creatorType': 'author', 'firstName': 'Fuli', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Guangbo', 'lastName': 'Hao'}, {'creatorType': 'author', 'firstName': 'Guanting', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Guowei', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'H.', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Han', 'lastName': 'Bao'}, {'creatorType': 'author', 'firstName': 'Hanwei', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Haocheng', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Honghui', 'lastName': 'Ding'}, {'creatorType': 'author', 'firstName': 'Huajian', 'lastName': 'Xin'}, {'creatorType': 'author', 'firstName': 'Huazuo', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Hui', 'lastName': 'Qu'}, {'creatorType': 'author', 'firstName': 'Hui', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Jianzhong', 'lastName': 'Guo'}, 
{'creatorType': 'author', 'firstName': 'Jiashi', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Jiawei', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Jingchang', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Jingyang', 'lastName': 'Yuan'}, {'creatorType': 'author', 'firstName': 'Junjie', 'lastName': 'Qiu'}, {'creatorType': 'author', 'firstName': 'Junlong', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'J. L.', 'lastName': 'Cai'}, {'creatorType': 'author', 'firstName': 'Jiaqi', 'lastName': 'Ni'}, {'creatorType': 'author', 'firstName': 'Jian', 'lastName': 'Liang'}, {'creatorType': 'author', 'firstName': 'Jin', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Kai', 'lastName': 'Dong'}, {'creatorType': 'author', 'firstName': 'Kai', 'lastName': 'Hu'}, {'creatorType': 'author', 'firstName': 'Kaige', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Kang', 'lastName': 'Guan'}, {'creatorType': 'author', 'firstName': 'Kexin', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Kuai', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Lean', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Lecong', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Liang', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Litong', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Liyue', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Lei', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Leyi', 'lastName': 'Xia'}, {'creatorType': 'author', 'firstName': 'Mingchuan', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Minghua', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Minghui', 'lastName': 'Tang'}, {'creatorType': 'author', 'firstName': 'Meng', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Miaojun', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Mingming', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Ning', 'lastName': 'Tian'}, {'creatorType': 'author', 'firstName': 'Panpan', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Peng', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Qiancheng', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Qinyu', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Qiushi', 'lastName': 'Du'}, {'creatorType': 'author', 'firstName': 'Ruiqi', 'lastName': 'Ge'}, {'creatorType': 'author', 'firstName': 'Ruisong', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Ruizhe', 'lastName': 'Pan'}, {'creatorType': 'author', 'firstName': 'Runji', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'R. J.', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'R. L.', 'lastName': 'Jin'}, {'creatorType': 'author', 'firstName': 'Ruyi', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Shanghao', 'lastName': 'Lu'}, {'creatorType': 'author', 'firstName': 'Shangyan', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Shanhuang', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Shengfeng', 'lastName': 'Ye'}, {'creatorType': 'author', 'firstName': 'Shiyu', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Shuiping', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Shunfeng', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Shuting', 'lastName': 'Pan'}, {'creatorType': 'author', 'firstName': 'S. 
S.', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Shuang', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Shaoqing', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Shengfeng', 'lastName': 'Ye'}, {'creatorType': 'author', 'firstName': 'Tao', 'lastName': 'Yun'}, {'creatorType': 'author', 'firstName': 'Tian', 'lastName': 'Pei'}, {'creatorType': 'author', 'firstName': 'Tianyu', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'T.', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Wangding', 'lastName': 'Zeng'}, {'creatorType': 'author', 'firstName': 'Wanjia', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Wen', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Wenfeng', 'lastName': 'Liang'}, {'creatorType': 'author', 'firstName': 'Wenjun', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Wenqin', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Wentao', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'W. L.', 'lastName': 'Xiao'}, {'creatorType': 'author', 'firstName': 'Wei', 'lastName': 'An'}, {'creatorType': 'author', 'firstName': 'Xiaodong', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Xiaohan', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Xiaokang', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Xiaotao', 'lastName': 'Nie'}, {'creatorType': 'author', 'firstName': 'Xin', 'lastName': 'Cheng'}, {'creatorType': 'author', 'firstName': 'Xin', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Xin', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Xingchao', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Xinyu', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Xinyuan', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Xuecheng', 'lastName': 'Su'}, {'creatorType': 'author', 'firstName': 'Xuheng', 'lastName': 'Lin'}, {'creatorType': 'author', 'firstName': 'X. Q.', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Xiangyue', 'lastName': 'Jin'}, {'creatorType': 'author', 'firstName': 'Xiaojin', 'lastName': 'Shen'}, {'creatorType': 'author', 'firstName': 'Xiaosha', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Xiaowen', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'Xiaoxiang', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Xinnan', 'lastName': 'Song'}, {'creatorType': 'author', 'firstName': 'Xinyi', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Xianzu', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Xinxia', 'lastName': 'Shan'}, {'creatorType': 'author', 'firstName': 'Y. K.', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Y. Q.', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Y. 
X.', 'lastName': 'Wei'}, {'creatorType': 'author', 'firstName': 'Yang', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Yanhong', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Yao', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Yao', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Yaofeng', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'Yaohui', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Yichao', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Yifan', 'lastName': 'Shi'}, {'creatorType': 'author', 'firstName': 'Yiliang', 'lastName': 'Xiong'}, {'creatorType': 'author', 'firstName': 'Ying', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Yishi', 'lastName': 'Piao'}, {'creatorType': 'author', 'firstName': 'Yisong', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yixuan', 'lastName': 'Tan'}, {'creatorType': 'author', 'firstName': 'Yiyang', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Yiyuan', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Yongqiang', 'lastName': 'Guo'}, {'creatorType': 'author', 'firstName': 'Yuan', 'lastName': 'Ou'}, {'creatorType': 'author', 'firstName': 'Yuduan', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yue', 'lastName': 'Gong'}, {'creatorType': 'author', 'firstName': 'Yuheng', 'lastName': 'Zou'}, {'creatorType': 'author', 'firstName': 'Yujia', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Yunfan', 'lastName': 'Xiong'}, {'creatorType': 'author', 'firstName': 'Yuxiang', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Yuxiang', 'lastName': 'You'}, {'creatorType': 'author', 'firstName': 'Yuxuan', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Yuyang', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Y. X.', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Yanhong', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Yanping', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Yaohui', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Zheng'}, {'creatorType': 'author', 'firstName': 'Yuchen', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Yunxian', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Ying', 'lastName': 'Tang'}, {'creatorType': 'author', 'firstName': 'Yukun', 'lastName': 'Zha'}, {'creatorType': 'author', 'firstName': 'Yuting', 'lastName': 'Yan'}, {'creatorType': 'author', 'firstName': 'Z. 
Z.', 'lastName': 'Ren'}, {'creatorType': 'author', 'firstName': 'Zehui', 'lastName': 'Ren'}, {'creatorType': 'author', 'firstName': 'Zhangli', 'lastName': 'Sha'}, {'creatorType': 'author', 'firstName': 'Zhe', 'lastName': 'Fu'}, {'creatorType': 'author', 'firstName': 'Zhean', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Zhenda', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Zhengyan', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Zhewen', 'lastName': 'Hao'}, {'creatorType': 'author', 'firstName': 'Zhicheng', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Zhigang', 'lastName': 'Yan'}, {'creatorType': 'author', 'firstName': 'Zhiyu', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Zihui', 'lastName': 'Gu'}, {'creatorType': 'author', 'firstName': 'Zijia', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Zijun', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Zilin', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Ziwei', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Ziyang', 'lastName': 'Song'}, {'creatorType': 'author', 'firstName': 'Zizheng', 'lastName': 'Pan'}, {'creatorType': 'author', 'firstName': 'Zhen', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Zhipeng', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Zhongyu', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Zhen', 'lastName': 'Zhang'}], 'abstractNote': 'We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models (1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2501.12948', 'place': '', 'date': '2025-01-22', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2501.12948', 'citationKey': '', 'url': 'http://arxiv.org/abs/2501.12948', 'accessDate': '2025-02-04T07:55:07Z', 'archive': '', 'archiveLocation': '', 'shortTitle': 'DeepSeek-R1', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2501.12948 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computation and Language', 'type': 1}, {'tag': 'Computer Science - Machine Learning', 'type': 1}], 'collections': ['7TU9FL49'], 'relations': {}, 'dateAdded': '2025-02-04T07:55:07Z', 'dateModified': '2025-02-04T07:55:07Z'}\n",
      "{'key': '8PFNZ8S9', 'version': 5038, 'itemType': 'preprint', 'title': 'DeepSeek-V3 Technical Report', 'creators': [{'creatorType': 'author', 'firstName': '', 'lastName': 'DeepSeek-AI'}, {'creatorType': 'author', 'firstName': 'Aixin', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Bei', 'lastName': 'Feng'}, {'creatorType': 'author', 'firstName': 'Bing', 'lastName': 'Xue'}, {'creatorType': 'author', 'firstName': 'Bingxuan', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Bochao', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Chengda', 'lastName': 'Lu'}, {'creatorType': 'author', 'firstName': 'Chenggang', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Chengqi', 'lastName': 'Deng'}, {'creatorType': 'author', 'firstName': 'Chenyu', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Chong', 'lastName': 'Ruan'}, {'creatorType': 'author', 'firstName': 'Damai', 'lastName': 'Dai'}, {'creatorType': 'author', 'firstName': 'Daya', 'lastName': 'Guo'}, {'creatorType': 'author', 'firstName': 'Dejian', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Deli', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Dongjie', 'lastName': 'Ji'}, {'creatorType': 'author', 'firstName': 'Erhang', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Fangyun', 'lastName': 'Lin'}, {'creatorType': 'author', 'firstName': 'Fucong', 'lastName': 'Dai'}, {'creatorType': 'author', 'firstName': 'Fuli', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Guangbo', 'lastName': 'Hao'}, {'creatorType': 'author', 'firstName': 'Guanting', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Guowei', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'H.', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Han', 'lastName': 'Bao'}, {'creatorType': 'author', 'firstName': 'Hanwei', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Haocheng', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Haowei', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Honghui', 'lastName': 'Ding'}, {'creatorType': 'author', 'firstName': 'Huajian', 'lastName': 'Xin'}, {'creatorType': 'author', 'firstName': 'Huazuo', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Hui', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Hui', 'lastName': 'Qu'}, {'creatorType': 'author', 'firstName': 'J. 
L.', 'lastName': 'Cai'}, {'creatorType': 'author', 'firstName': 'Jian', 'lastName': 'Liang'}, {'creatorType': 'author', 'firstName': 'Jianzhong', 'lastName': 'Guo'}, {'creatorType': 'author', 'firstName': 'Jiaqi', 'lastName': 'Ni'}, {'creatorType': 'author', 'firstName': 'Jiashi', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Jiawei', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Jin', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Jingchang', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Jingyang', 'lastName': 'Yuan'}, {'creatorType': 'author', 'firstName': 'Junjie', 'lastName': 'Qiu'}, {'creatorType': 'author', 'firstName': 'Junlong', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Junxiao', 'lastName': 'Song'}, {'creatorType': 'author', 'firstName': 'Kai', 'lastName': 'Dong'}, {'creatorType': 'author', 'firstName': 'Kai', 'lastName': 'Hu'}, {'creatorType': 'author', 'firstName': 'Kaige', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Kang', 'lastName': 'Guan'}, {'creatorType': 'author', 'firstName': 'Kexin', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Kuai', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Lean', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Lecong', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Lei', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Leyi', 'lastName': 'Xia'}, {'creatorType': 'author', 'firstName': 'Liang', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Litong', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Liyue', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Meng', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Miaojun', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Mingchuan', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Minghua', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Minghui', 'lastName': 'Tang'}, {'creatorType': 'author', 'firstName': 'Mingming', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Ning', 'lastName': 'Tian'}, {'creatorType': 'author', 'firstName': 'Panpan', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Peiyi', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Peng', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Qiancheng', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Qihao', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Qinyu', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Qiushi', 'lastName': 'Du'}, {'creatorType': 'author', 'firstName': 'R. J.', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'R. L.', 'lastName': 'Jin'}, {'creatorType': 'author', 'firstName': 'Ruiqi', 'lastName': 'Ge'}, {'creatorType': 'author', 'firstName': 'Ruisong', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Ruizhe', 'lastName': 'Pan'}, {'creatorType': 'author', 'firstName': 'Runji', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Runxin', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Ruoyu', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Ruyi', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'S. 
S.', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Shanghao', 'lastName': 'Lu'}, {'creatorType': 'author', 'firstName': 'Shangyan', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Shanhuang', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Shaoqing', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Shengfeng', 'lastName': 'Ye'}, {'creatorType': 'author', 'firstName': 'Shengfeng', 'lastName': 'Ye'}, {'creatorType': 'author', 'firstName': 'Shirong', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Shiyu', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Shuang', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Shuiping', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Shunfeng', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Shuting', 'lastName': 'Pan'}, {'creatorType': 'author', 'firstName': 'T.', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Tao', 'lastName': 'Yun'}, {'creatorType': 'author', 'firstName': 'Tian', 'lastName': 'Pei'}, {'creatorType': 'author', 'firstName': 'Tianyu', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'W. L.', 'lastName': 'Xiao'}, {'creatorType': 'author', 'firstName': 'Wangding', 'lastName': 'Zeng'}, {'creatorType': 'author', 'firstName': 'Wanjia', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Wei', 'lastName': 'An'}, {'creatorType': 'author', 'firstName': 'Wen', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Wenfeng', 'lastName': 'Liang'}, {'creatorType': 'author', 'firstName': 'Wenjun', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Wenqin', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Wentao', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'X. Q.', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Xiangyue', 'lastName': 'Jin'}, {'creatorType': 'author', 'firstName': 'Xianzu', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Xiao', 'lastName': 'Bi'}, {'creatorType': 'author', 'firstName': 'Xiaodong', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Xiaohan', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Xiaojin', 'lastName': 'Shen'}, {'creatorType': 'author', 'firstName': 'Xiaokang', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Xiaokang', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Xiaosha', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Xiaotao', 'lastName': 'Nie'}, {'creatorType': 'author', 'firstName': 'Xiaowen', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'Xiaoxiang', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Xin', 'lastName': 'Cheng'}, {'creatorType': 'author', 'firstName': 'Xin', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Xin', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Xingchao', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Xingkai', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Xinnan', 'lastName': 'Song'}, {'creatorType': 'author', 'firstName': 'Xinxia', 'lastName': 'Shan'}, {'creatorType': 'author', 'firstName': 'Xinyi', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Xinyu', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Xinyuan', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Xuecheng', 'lastName': 'Su'}, {'creatorType': 'author', 'firstName': 'Xuheng', 'lastName': 'Lin'}, {'creatorType': 'author', 'firstName': 'Y. 
K.', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Y. Q.', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Y. X.', 'lastName': 'Wei'}, {'creatorType': 'author', 'firstName': 'Y. X.', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Yang', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Yanhong', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Yanhong', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Yanping', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Yao', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Yao', 'lastName': 'Zhao'}, {'creatorType': 'author', 'firstName': 'Yaofeng', 'lastName': 'Sun'}, {'creatorType': 'author', 'firstName': 'Yaohui', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Yaohui', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Yu'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Zheng'}, {'creatorType': 'author', 'firstName': 'Yichao', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Yifan', 'lastName': 'Shi'}, {'creatorType': 'author', 'firstName': 'Yiliang', 'lastName': 'Xiong'}, {'creatorType': 'author', 'firstName': 'Ying', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Ying', 'lastName': 'Tang'}, {'creatorType': 'author', 'firstName': 'Yishi', 'lastName': 'Piao'}, {'creatorType': 'author', 'firstName': 'Yisong', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yixuan', 'lastName': 'Tan'}, {'creatorType': 'author', 'firstName': 'Yiyang', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Yiyuan', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Yongqiang', 'lastName': 'Guo'}, {'creatorType': 'author', 'firstName': 'Yu', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Yuan', 'lastName': 'Ou'}, {'creatorType': 'author', 'firstName': 'Yuchen', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Yuduan', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yue', 'lastName': 'Gong'}, {'creatorType': 'author', 'firstName': 'Yuheng', 'lastName': 'Zou'}, {'creatorType': 'author', 'firstName': 'Yujia', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Yukun', 'lastName': 'Zha'}, {'creatorType': 'author', 'firstName': 'Yunfan', 'lastName': 'Xiong'}, {'creatorType': 'author', 'firstName': 'Yunxian', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Yuting', 'lastName': 'Yan'}, {'creatorType': 'author', 'firstName': 'Yuxiang', 'lastName': 'Luo'}, {'creatorType': 'author', 'firstName': 'Yuxiang', 'lastName': 'You'}, {'creatorType': 'author', 'firstName': 'Yuxuan', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Yuyang', 'lastName': 'Zhou'}, {'creatorType': 'author', 'firstName': 'Z. F.', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Z. 
Z.', 'lastName': 'Ren'}, {'creatorType': 'author', 'firstName': 'Zehui', 'lastName': 'Ren'}, {'creatorType': 'author', 'firstName': 'Zhangli', 'lastName': 'Sha'}, {'creatorType': 'author', 'firstName': 'Zhe', 'lastName': 'Fu'}, {'creatorType': 'author', 'firstName': 'Zhean', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Zhen', 'lastName': 'Huang'}, {'creatorType': 'author', 'firstName': 'Zhen', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Zhenda', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Zhengyan', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Zhewen', 'lastName': 'Hao'}, {'creatorType': 'author', 'firstName': 'Zhibin', 'lastName': 'Gou'}, {'creatorType': 'author', 'firstName': 'Zhicheng', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Zhigang', 'lastName': 'Yan'}, {'creatorType': 'author', 'firstName': 'Zhihong', 'lastName': 'Shao'}, {'creatorType': 'author', 'firstName': 'Zhipeng', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Zhiyu', 'lastName': 'Wu'}, {'creatorType': 'author', 'firstName': 'Zhongyu', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Zhuoshu', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Zihui', 'lastName': 'Gu'}, {'creatorType': 'author', 'firstName': 'Zijia', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Zijun', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Zilin', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Ziwei', 'lastName': 'Xie'}, {'creatorType': 'author', 'firstName': 'Ziyang', 'lastName': 'Song'}, {'creatorType': 'author', 'firstName': 'Ziyi', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Zizheng', 'lastName': 'Pan'}], 'abstractNote': 'We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and high-quality tokens, followed by Supervised Fine-Tuning and Reinforcement Learning stages to fully harness its capabilities. Comprehensive evaluations reveal that DeepSeek-V3 outperforms other open-source models and achieves performance comparable to leading closed-source models. Despite its excellent performance, DeepSeek-V3 requires only 2.788M H800 GPU hours for its full training. In addition, its training process is remarkably stable. Throughout the entire training process, we did not experience any irrecoverable loss spikes or perform any rollbacks. 
The model checkpoints are available at https://github.com/deepseek-ai/DeepSeek-V3.', 'genre': '', 'repository': 'arXiv', 'archiveID': 'arXiv:2412.19437', 'place': '', 'date': '2024-12-27', 'series': '', 'seriesNumber': '', 'DOI': '10.48550/arXiv.2412.19437', 'citationKey': '', 'url': 'http://arxiv.org/abs/2412.19437', 'accessDate': '2025-02-04T07:54:12Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': '', 'libraryCatalog': 'arXiv.org', 'callNumber': '', 'rights': '', 'extra': 'arXiv:2412.19437 [cs]', 'tags': [{'tag': 'Computer Science - Artificial Intelligence', 'type': 1}, {'tag': 'Computer Science - Computation and Language', 'type': 1}], 'collections': ['7TU9FL49'], 'relations': {}, 'dateAdded': '2025-02-04T07:54:12Z', 'dateModified': '2025-02-04T07:54:12Z'}\n",
      "{'key': 'YZF4Q3TW', 'version': 5032, 'itemType': 'conferencePaper', 'title': 'Deep Residual Learning for Image Recognition', 'creators': [{'creatorType': 'author', 'firstName': 'Kaiming', 'lastName': 'He'}, {'creatorType': 'author', 'firstName': 'Xiangyu', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Shaoqing', 'lastName': 'Ren'}, {'creatorType': 'author', 'firstName': 'Jian', 'lastName': 'Sun'}], 'abstractNote': 'Deeper neural networks are more difﬁcult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers—8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classiﬁcation task. We also present analysis on CIFAR-10 with 100 and 1000 layers.', 'date': '6/2016', 'proceedingsTitle': '2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)', 'conferenceName': '2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)', 'place': 'Las Vegas, NV, USA', 'publisher': 'IEEE', 'volume': '', 'pages': '770-778', 'series': '', 'language': 'en', 'DOI': '10.1109/CVPR.2016.90', 'ISBN': '978-1-4673-8851-1', 'shortTitle': '', 'url': 'http://ieeexplore.ieee.org/document/7780459/', 'accessDate': '2025-02-04T07:53:20Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': ['VM4CAJSG'], 'relations': {}, 'dateAdded': '2025-02-04T07:53:20Z', 'dateModified': '2025-02-04T07:53:20Z'}\n",
      "{'key': 'JX8MTRF7', 'version': 3997, 'itemType': 'journalArticle', 'title': 'An efficient CNN-based detector for photovoltaic module cells defect detection in electroluminescence images', 'creators': [{'creatorType': 'author', 'firstName': 'Qing', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Min', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Chenze', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Q. M. Jonathan', 'lastName': 'Wu'}], 'abstractNote': 'Electroluminescence (EL) imaging provides a high spatial resolution for inspecting photovoltaic (PV) cells, enabling the detection of various types of PV cell defects. Recently, convolutional neural network (CNN) based automatic detection methods for PV cell defects using EL images have attracted much attention. However, existing methods struggle to achieve a good balance between detection accuracy and efficiency. To address this issue, we propose a novel method for efficient PV cell defect detection. Firstly, we utilize Contrast Limited Adaptive Histogram Equalization (CLAHE) algorithm to improve EL image contrast, making defect features become more distinguishable. Secondly, we propose a lightweight defect detector using EfficientNet-B0 as its backbone. Moreover, we design a graph channel attention module (GCAM) to improve CNN’s limitation in modeling global information. It executes graph channel reasoning to generate enriched feature representation beyond the local receptive field, which is beneficial for distinguishing PV cell defects with similar local details. Next, we utilize focal loss to train the detector, enhancing its ability to detect challenging defects. Lastly, the proposed method is evaluated on the PVEL dataset and it achieved an accuracy of 97.81%, precision of 97.70%, recall of 97.59%, F1-score of 97.64%, and MCC of 97.32%, demonstrating our method is effective and outperforms state-of-the-art methods across various metrics.', 'publicationTitle': 'Solar Energy', 'volume': '267', 'issue': '', 'pages': '112245', 'date': '2024-01-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Solar Energy', 'language': '', 'DOI': '10.1016/j.solener.2023.112245', 'ISSN': '0038-092X', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0038092X23008794', 'accessDate': '2025-01-19T08:38:39Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'CLAHE', 'type': 1}, {'tag': 'Electroluminescence imaging', 'type': 1}, {'tag': 'Focal loss', 'type': 1}, {'tag': 'Graph channel attention module', 'type': 1}, {'tag': 'Lightweight defect detector', 'type': 1}, {'tag': 'Photovoltaic cell defect detection', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-01-19T08:38:39Z', 'dateModified': '2025-01-19T08:38:39Z'}\n",
      "{'key': '27LXG2PB', 'version': 3992, 'itemType': 'conferencePaper', 'title': 'SCConv: Spatial and Channel Reconstruction Convolution for Feature Redundancy', 'creators': [{'creatorType': 'author', 'firstName': 'Jiafeng', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Ying', 'lastName': 'Wen'}, {'creatorType': 'author', 'firstName': 'Lianghua', 'lastName': 'He'}], 'abstractNote': 'Convolutional Neural Networks (CNNs) have achieved remarkable performance in various computer vision tasks but this comes at the cost of tremendous computational resources, partly due to convolutional layers extracting redundant features. Recent works either compress well-trained large-scale models or explore well-designed lightweight models. In this paper, we make an attempt to exploit spatial and channel redundancy among features for CNN compression and propose an efficient convolution module, called SCConv (Spatial and Channel reconstruction Convolution), to decrease redundant computing and facilitate representative feature learning. The proposed SCConv consists of two units: spatial reconstruction unit (SRU) and channel reconstruction unit (CRU). SRU utilizes a separate-and-reconstruct method to suppress the spatial redundancy while CRU uses a split-transform-andfuse strategy to diminish the channel redundancy. In addition, SCConv is a plug-and-play architectural unit that can be used to replace standard convolution in various convolutional neural networks directly. Experimental results show that SCConv-embedded models are able to achieve better performance by reducing redundant features with significantly lower complexity and computational costs.', 'date': '6/2023', 'proceedingsTitle': '2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'conferenceName': '2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)', 'place': 'Vancouver, BC, Canada', 'publisher': 'IEEE', 'volume': '', 'pages': '6153-6162', 'series': '', 'language': 'en', 'DOI': '10.1109/CVPR52729.2023.00596', 'ISBN': '979-8-3503-0129-8', 'shortTitle': 'SCConv', 'url': 'https://ieeexplore.ieee.org/document/10204928/', 'accessDate': '2024-12-24T13:56:18Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': 'https://doi.org/10.15223/policy-029', 'extra': '', 'tags': [{'tag': 'important-model'}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2024-12-24T13:56:18Z', 'dateModified': '2025-01-19T08:00:52Z'}\n",
      "{'key': '8V23V8KI', 'version': 3985, 'itemType': 'journalArticle', 'title': 'Use of Drone and Infrared Camera for a Campus Building Envelope Study', 'creators': [{'creatorType': 'author', 'firstName': 'Raheem', 'lastName': 'Ariwoola'}], 'abstractNote': '', 'publicationTitle': 'Electronic Theses and Dissertations', 'volume': '', 'issue': '', 'pages': '', 'date': '2016-05-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '', 'ISSN': '', 'shortTitle': '', 'url': 'https://dc.etsu.edu/etd/3018', 'accessDate': '', 'archive': '', 'archiveLocation': '', 'libraryCatalog': '', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-09-29T06:44:47Z', 'dateModified': '2025-01-19T07:17:33Z'}\n",
      "{'key': '588Y45WX', 'version': 3985, 'itemType': 'journalArticle', 'title': 'UAV Remote Sensing Image Dehazing Based on Double-Scale Transmission Optimization Strategy', 'creators': [{'creatorType': 'author', 'firstName': 'Kemeng', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Sijia', 'lastName': 'Ma'}, {'creatorType': 'author', 'firstName': 'Ruohui', 'lastName': 'Zheng'}, {'creatorType': 'author', 'firstName': 'Libao', 'lastName': 'Zhang'}], 'abstractNote': 'Current dehazing methods for unmanned aerial vehicle (UAV) remote sensing images often have texture detail loss and color distortion problems, especially in highlighted regions. This is mainly due to the rich texture and low intensity of UAV remote sensing images being ignored, which results in incorrect transmission estimation. In this letter, we propose a UAV remote sensing image dehazing method based on a double-scale transmission optimization strategy. First, we propose a double-scale optimization strategy to estimate the transmission map with more accurate texture details and color preservation, especially in highlighted regions of hazy UAV images that are most severely distorted. Second, a UAV-adaptive haze-line prior algorithm is proposed to address the large scene depth and low intensity of UAV remote sensing images. Finally, we introduce a luminance-weighted frequency-domain saliency model to avoid texture detail loss and color distortions for better transmission optimization, especially in highlighted regions. Compared with state-of-the-art methods, our method shows better detail performance and visual effects, especially for UAV images with highlighted regions.', 'publicationTitle': 'IEEE Geoscience and Remote Sensing Letters', 'volume': '19', 'issue': '', 'pages': '1-5', 'date': '2022', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/LGRS.2022.3206205', 'ISSN': '1558-0571', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/9888129', 'accessDate': '2024-09-29T06:42:04Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Geoscience and Remote Sensing Letters', 'tags': [{'tag': 'Autonomous aerial vehicles', 'type': 1}, {'tag': 'Dehazing', 'type': 1}, {'tag': 'Distortion', 'type': 1}, {'tag': 'Feature extraction', 'type': 1}, {'tag': 'Image color analysis', 'type': 1}, {'tag': 'Optimization', 'type': 1}, {'tag': 'Propagation losses', 'type': 1}, {'tag': 'Remote sensing', 'type': 1}, {'tag': 'remote sensing', 'type': 1}, {'tag': 'saliency detection', 'type': 1}, {'tag': 'transmission optimization', 'type': 1}, {'tag': 'unmanned aerial vehicle (UAV)', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-09-29T06:42:04Z', 'dateModified': '2025-01-19T07:17:33Z'}\n",
      "{'key': '64D8V8M5', 'version': 3984, 'itemType': 'conferencePaper', 'title': 'Traffic signal coordination for emergency vehicles', 'creators': [{'creatorType': 'author', 'firstName': 'Wenwen', 'lastName': 'Kang'}, {'creatorType': 'author', 'firstName': 'Gang', 'lastName': 'Xiong'}, {'creatorType': 'author', 'firstName': 'Yisheng', 'lastName': 'Lv'}, {'creatorType': 'author', 'firstName': 'Xisong', 'lastName': 'Dong'}, {'creatorType': 'author', 'firstName': 'Fenghua', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Qingjie', 'lastName': 'Kong'}], 'abstractNote': 'Reducing travel time of emergency vehicles (EVs) has a potential in significant savings of life and property. Integrating modern intelligent transportation system (ITS) with EV signal preemption seems to be a solution. But existing EV signal preemption systems often break the current signal coordination and impact a lot on the normal traffic streams. In this paper we propose an emergency vehicle signal coordination (EVSC) approach, which is intended to provide “green wave” for EVs. Traffic simulations are conducted along an emergency corridor with 8 intersections in Qingdao, China. Multiple traffic measurements are compared between simulation outputs with and without EVSC operation. The result indicates that the proposed approach can reduce EV travel time by 26.9% without too much negative impact on the normal traffic streams.', 'date': '2014-10', 'proceedingsTitle': '17th International IEEE Conference on Intelligent Transportation Systems (ITSC)', 'conferenceName': '17th International IEEE Conference on Intelligent Transportation Systems (ITSC)', 'place': '', 'publisher': '', 'volume': '', 'pages': '157-161', 'series': '', 'language': '', 'DOI': '10.1109/ITSC.2014.6957683', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/6957683', 'accessDate': '2024-08-13T10:29:32Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2153-0017', 'tags': [{'tag': 'Cities and towns', 'type': 1}, {'tag': 'Delays', 'type': 1}, {'tag': 'Mathematical model', 'type': 1}, {'tag': 'Roads', 'type': 1}, {'tag': 'Traffic control', 'type': 1}, {'tag': 'Vehicles', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-08-13T10:29:32Z', 'dateModified': '2025-01-19T07:17:32Z'}\n",
      "{'key': '8I7U89BB', 'version': 3984, 'itemType': 'journalArticle', 'title': 'Safety Performance of Unsignalized Median U-Turn Intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Jonathan', 'lastName': 'Kay'}, {'creatorType': 'author', 'firstName': 'Timothy', 'lastName': 'Gates'}, {'creatorType': 'author', 'firstName': 'Peter', 'lastName': 'Savolainen'}, {'creatorType': 'author', 'firstName': 'Md Shakir', 'lastName': 'Mahmud'}], 'abstractNote': 'Alternative intersection designs can offer safety and operational benefits with potentially lower costs than conventional intersections when implemented in the proper setting. The Federal Highway Administration has previously identified a subset of alternative designs called reduced left-turn conflict intersections as a proven safety countermeasure. Median U-turn intersections (also known as “Michigan lefts” or “boulevard turnarounds”) are one such design that accommodates all left-turn movements via directional U-turn crossovers within the median. Prior work has consistently shown that median U-turn intersections can provide superior safety performance when used in the appropriate conditions. However, research that is specific to unsignalized reduced left-turn conflict intersections continues to be limited to work conducted before the Highway Safety Manual, or which includes restricted crossing U-turn intersections. This study included the evaluation of historical traffic crashes and volume data at 95 unsignalized intersections in the state of Michigan. This included the collection of data for 39 median U-turn sites and 56 reference group sites to estimate safety performance functions and crash modification factors that can be used when considering future conversions. Ultimately, crash modification factors for fatal and injury crashes of 0.438 and 0.686 are recommended when converting intersections with undivided two-lane two-way major approaches and four-lane divided boulevard major approaches, respectively. Although there was no significant difference in property damage only crashes associated with converting intersections with undivided, two-lane, two-way major approaches, a crash modification factor of 1.325 is recommended for property damage only crashes specific to conversions with four-lane, divided boulevard major approaches.', 'publicationTitle': 'Transportation Research Record: Journal of the Transportation Research Board', 'volume': '2676', 'issue': '', 'pages': '036119812210869', 'date': '2022-04-29', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Transportation Research Record: Journal of the Transportation Research Board', 'language': '', 'DOI': '10.1177/03611981221086936', 'ISSN': '', 'shortTitle': '', 'url': '', 'accessDate': '', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ResearchGate', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-15T07:22:58Z', 'dateModified': '2025-01-19T07:17:32Z'}\n",
      "{'key': 'GAHIMXS5', 'version': 3984, 'itemType': 'journalArticle', 'title': 'Safety Performance of Unsignalized Median U-Turn Intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Jonathan', 'lastName': 'Kay'}, {'creatorType': 'author', 'firstName': 'Timothy', 'lastName': 'Gates'}, {'creatorType': 'author', 'firstName': 'Peter', 'lastName': 'Savolainen'}, {'creatorType': 'author', 'firstName': 'Md Shakir', 'lastName': 'Mahmud'}], 'abstractNote': 'Alternative intersection designs can offer safety and operational benefits with potentially lower costs than conventional intersections when implemented in the proper setting. The Federal Highway Administration has previously identified a subset of alternative designs called reduced left-turn conflict intersections as a proven safety countermeasure. Median U-turn intersections (also known as “Michigan lefts” or “boulevard turnarounds”) are one such design that accommodates all left-turn movements via directional U-turn crossovers within the median. Prior work has consistently shown that median U-turn intersections can provide superior safety performance when used in the appropriate conditions. However, research that is specific to unsignalized reduced left-turn conflict intersections continues to be limited to work conducted before the Highway Safety Manual, or which includes restricted crossing U-turn intersections. This study included the evaluation of historical traffic crashes and volume data at 95 unsignalized intersections in the state of Michigan. This included the collection of data for 39 median U-turn sites and 56 reference group sites to estimate safety performance functions and crash modification factors that can be used when considering future conversions. Ultimately, crash modification factors for fatal and injury crashes of 0.438 and 0.686 are recommended when converting intersections with undivided two-lane two-way major approaches and four-lane divided boulevard major approaches, respectively. Although there was no significant difference in property damage only crashes associated with converting intersections with undivided, two-lane, two-way major approaches, a crash modification factor of 1.325 is recommended for property damage only crashes specific to conversions with four-lane, divided boulevard major approaches.', 'publicationTitle': 'Transportation Research Record: Journal of the Transportation Research Board', 'volume': '2676', 'issue': '', 'pages': '036119812210869', 'date': '2022-04-29', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Transportation Research Record: Journal of the Transportation Research Board', 'language': '', 'DOI': '10.1177/03611981221086936', 'ISSN': '', 'shortTitle': '', 'url': '', 'accessDate': '', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ResearchGate', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-15T07:13:51Z', 'dateModified': '2025-01-19T07:17:32Z'}\n",
      "{'key': 'UZSVZPAA', 'version': 3990, 'itemType': 'journalArticle', 'title': 'PVF-10: A high-resolution unmanned aerial vehicle thermal infrared image dataset for fine-grained photovoltaic fault classificationPVF-10： 用于精细光伏故障分类的高分辨率无人机热红外图像数据集', 'creators': [{'creatorType': 'author', 'firstName': 'Bo', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Qi', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Mengmeng', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yuntian', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Zhengjia', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Xiuguo', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Wei', 'lastName': 'Gao'}, {'creatorType': 'author', 'firstName': 'Yanzhen', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Haoran', 'lastName': 'Zhang'}], 'abstractNote': 'Accurate identification of faulty photovoltaic (PV) modules is crucial for the effective operation and maintenance of PV systems. Deep learning (DL) algorithms exhibit promising potential for classifying PV fault (PVF) from thermal infrared (TIR) images captured by unmanned aerial vehicle (UAV), contingent upon the availability of extensive and high-quality labeled data. However, existing TIR PVF datasets are limited by low image resolution and incomplete coverage of fault types. This study proposes a high-resolution TIR PVF dataset with 10 classes, named PVF-10, comprising 5579 cropped images of PV panels collected from 8 PV power plants. These classes are further categorized into two groups according to the repairability of PVF, with 5 repairable and 5 irreparable classes each. Additionally, the circuit mechanisms underlying the TIR image features of typical PVF types are analyzed, supported by high-resolution images, thereby providing comprehensive information for PV operators. Finally, five state-of-the-art DL algorithms are trained and validated based on the PVF-10 dataset using three levels of resampling strategy. The results show that the overall accuracy (OA) of these algorithms exceeds 83%, with the highest OA reaching 93.32%. Moreover, the preprocessing procedure involving resampling and padding strategies are beneficial for improving PVF classification accuracy using PVF-10 datasets. The developed PVF-10 dataset is expected to stimulate further research and innovation in PVF classification.', 'publicationTitle': 'Applied Energy', 'volume': '376', 'issue': '', 'pages': '124187', 'date': '2024-12-15', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Applied Energy', 'language': '', 'DOI': '10.1016/j.apenergy.2024.124187', 'ISSN': '0306-2619', 'shortTitle': 'PVF-10', 'url': 'https://www.sciencedirect.com/science/article/pii/S0306261924015708', 'accessDate': '2024-11-22T10:15:58Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Classification', 'type': 1}, {'tag': 'Deep learning', 'type': 1}, {'tag': 'Photovoltaic fault', 'type': 1}, {'tag': 'Thermal infrared data', 'type': 1}, {'tag': 'Unmanned aerial vehicle', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2024-11-22T10:15:58Z', 'dateModified': '2025-01-19T07:17:31Z'}\n",
      "{'key': 'NSABBZL9', 'version': 3984, 'itemType': 'journalArticle', 'title': 'Optimal time trajectory and coordination for connected and automated vehicles', 'creators': [{'creatorType': 'author', 'firstName': 'Andreas A.', 'lastName': 'Malikopoulos'}, {'creatorType': 'author', 'firstName': 'Logan', 'lastName': 'Beaver'}, {'creatorType': 'author', 'firstName': 'Ioannis Vasileios', 'lastName': 'Chremos'}], 'abstractNote': 'In this paper, we provide a decentralized theoretical framework for coordination of connected and automated vehicles (CAVs) at different traffic scenarios. The framework includes: (1) an upper-level optimization that yields for each CAV its optimal time trajectory and lane to pass through a given traffic scenario while alleviating congestion; and (2) a low-level optimization that yields for each CAV its optimal control input (acceleration/deceleration). We provide a complete, analytical solution of the low-level optimization problem that includes the rear-end, speed-dependent safety constraint. Furthermore, we provide a problem formulation for the upper-level optimization in which there is no duality gap. The latter implies that the optimal time trajectory for each CAV does not activate any of the state, control, and safety constraints of the low-level optimization, thus allowing for online implementation. Finally, we present a geometric duality framework with hyperplanes to derive the condition under which the optimal solution of the upper-level optimization always exists. We validate the effectiveness of the proposed theoretical framework through simulation.', 'publicationTitle': 'Automatica', 'volume': '125', 'issue': '', 'pages': '109469', 'date': '2021-03-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Automatica', 'language': '', 'DOI': '10.1016/j.automatica.2020.109469', 'ISSN': '0005-1098', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0005109820306671', 'accessDate': '2024-07-04T16:03:51Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Autonomous intersections', 'type': 1}, {'tag': 'Connected and automated vehicles', 'type': 1}, {'tag': 'Cyber–physical systems', 'type': 1}, {'tag': 'Decentralized optimal control', 'type': 1}, {'tag': 'Emerging mobility', 'type': 1}, {'tag': 'Path planning', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-04T16:03:51Z', 'dateModified': '2025-01-19T07:17:31Z'}\n",
      "{'key': '4VLG85BI', 'version': 3984, 'itemType': 'journalArticle', 'title': 'Optimal traffic operation for maximum energy efficiency in signal-free urban networks: A macroscopic analytical approach', 'creators': [{'creatorType': 'author', 'firstName': 'Mahyar', 'lastName': 'Amirgholy'}, {'creatorType': 'author', 'firstName': 'H. Oliver', 'lastName': 'Gao'}], 'abstractNote': 'The integration of artificial intelligence and wireless communication technologies in communicant autonomous vehicles (CAVs) enables coordinating the movement of CAV platoons at signal-free intersections. The capacity of signal-free intersections can be significantly improved by adjusting traffic variables at a macroscopic scale; however, the resulting improvement in the capacity does not necessarily have a positive impact on the energy consumption of CAVs at the network level. In this research, we develop an analytical model to enhance energy efficiency by optimizing macroscopic traffic variables in signal-free networks. To this end, we adopt a macroscopic modeling approach to estimate the operational capacity by accounting for the stochasticity resulting from the error in synchronizing the arrival and departure of consecutive platoons in crossing directions at intersections. We also develop a macrolevel analytical model to estimate expected energy loss during the acceleration/deceleration maneuver required for resynchronization at intersections as a function of synchronization success probability. We then maximize energy efficiency by minimizing expected energy loss and maximizing expected capacity in a biobjective optimization framework. We solve the energy efficiency problem using an analytical approach to derive a closed-form solution for the optimal traffic speed and the length of the marginal gap between the passage of consecutive platoons in crossing directions through intersections for a (general) normal distribution of the operational error. Having the closed-form solution of the energy efficiency problem, we balance the trade-off between energy loss and operational capacity at a large scale by extending the analytical model to the network level using the Macroscopic Fundamental Diagram (MFD) concept. The results of our two-ring simulation model indicate the accuracy of the proposed analytical model in estimating the macroscopic relationship between the expected energy loss at intersections and the vehicular density in signal-free networks. 
Our numerical results also show that optimizing the traffic speed and marginal gap length can improve energy efficiency by 31% at the cost of a 16% decrease in maximum capacity.', 'publicationTitle': 'Applied Energy', 'volume': '329', 'issue': '', 'pages': '120128', 'date': '2023-01-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Applied Energy', 'language': '', 'DOI': '10.1016/j.apenergy.2022.120128', 'ISSN': '0306-2619', 'shortTitle': 'Optimal traffic operation for maximum energy efficiency in signal-free urban networks', 'url': 'https://www.sciencedirect.com/science/article/pii/S030626192201385X', 'accessDate': '2024-07-04T16:07:37Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Autonomous vehicles', 'type': 1}, {'tag': 'Biobjective optimization', 'type': 1}, {'tag': 'Closed-form solution', 'type': 1}, {'tag': 'Energy efficiency', 'type': 1}, {'tag': 'Macroscopic fundamental diagram', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-04T16:07:37Z', 'dateModified': '2025-01-19T07:17:31Z'}\n",
      "{'key': 'HY7IF2P3', 'version': 3984, 'itemType': 'journalArticle', 'title': 'Optimal Coordination of Platoons of Connected and Automated Vehicles at Signal-Free Intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Sharmila Devi', 'lastName': 'Kumaravel'}, {'creatorType': 'author', 'firstName': 'Andreas A.', 'lastName': 'Malikopoulos'}, {'creatorType': 'author', 'firstName': 'Ramakalyan', 'lastName': 'Ayyagari'}], 'abstractNote': 'In this paper, we address the problem of coordinating platoons of connected and automated vehicles crossing a signal-free intersection. We present a decentralized, two-level optimal framework to coordinate the platoons with the objective to minimize travel delay and fuel consumption of every platoon crossing the intersection. At the upper-level, each platoon leader derives a proven optimal schedule to enter the intersection. At the low-level, the platoon leader derives their optimal control input (acceleration/deceleration) for the optimal schedule derived in the upper-level. We validate the effectiveness of the proposed framework in simulation and show significant improvements both in travel delay and fuel consumption compared to the baseline scenarios where platoons enter the intersection based on first-come-first-serve and longest queue first - maximum weight matching scheduling algorithms.', 'publicationTitle': 'IEEE Transactions on Intelligent Vehicles', 'volume': '7', 'issue': '2', 'pages': '186-197', 'date': '2022-06', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/TIV.2021.3096993', 'ISSN': '2379-8904', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/9484798', 'accessDate': '2024-07-05T01:38:21Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Transactions on Intelligent Vehicles', 'tags': [{'tag': 'Delays', 'type': 1}, {'tag': 'Fuels', 'type': 1}, {'tag': 'Merging', 'type': 1}, {'tag': 'Optimal control', 'type': 1}, {'tag': 'Optimal scheduling', 'type': 1}, {'tag': 'Platoons coordination', 'type': 1}, {'tag': 'Schedules', 'type': 1}, {'tag': 'Scheduling', 'type': 1}, {'tag': 'connected and automated vehicles', 'type': 1}, {'tag': 'intersection control', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T01:38:21Z', 'dateModified': '2025-01-19T07:17:30Z'}\n",
      "{'key': '2XZEY3RY', 'version': 3984, 'itemType': 'journalArticle', 'title': 'Optimal Control for Connected and Autonomous Vehicles at Signal-Free Intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Boli', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Xiao', 'lastName': 'Pan'}, {'creatorType': 'author', 'firstName': 'Simos A.', 'lastName': 'Evangelou'}, {'creatorType': 'author', 'firstName': 'Stelios', 'lastName': 'Timotheou'}], 'abstractNote': 'The development of connected and autonomous vehicles (CAVs) is one of the central aspects in the pathway towards future intelligent mobility systems. This paper addresses the problem of coordinating CAVs crossing an uncontrolled intersection so as to maintain safe and efficient traffic flow. The proposed control strategy is based on an optimal control framework that is formulated to minimize a weighted sum of total energy consumption and travel time of all CAVs by finding the optimal velocity trajectory of each vehicle. The design procedure starts with a proper formulation of the autonomous intersection crossing problem for CAVs, with various cases of energy recovery capability by the CAVs considered, to also investigate the influence of powertrain electrification on the intersection crossing problem. This yields an optimal control problem (OCP) with nonlinear and nonconvex dynamics and constraints. In order to ensure a rapid solution search and a unique global optimum, the OCP is reformulated via convex modeling techniques. Numerical results validate the effectiveness of the proposed approaches, while the trade-off between energy consumption and travel time is illustrated by Pareto optimal solutions.', 'publicationTitle': 'IFAC-PapersOnLine', 'volume': '53', 'issue': '2', 'pages': '15306-15311', 'date': '2020-01-01', 'series': '21st IFAC World Congress', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'IFAC-PapersOnLine', 'language': '', 'DOI': '10.1016/j.ifacol.2020.12.2336', 'ISSN': '2405-8963', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S2405896320330056', 'accessDate': '2024-07-05T01:20:45Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Connected', 'type': 1}, {'tag': 'Energy consumption', 'type': 1}, {'tag': 'Intersections crossing', 'type': 1}, {'tag': 'Optimization', 'type': 1}, {'tag': 'Velocity control', 'type': 1}, {'tag': 'autonomous vehicles', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T01:20:45Z', 'dateModified': '2025-01-19T07:17:30Z'}\n",
      "{'key': 'FAEU3M2G', 'version': 3984, 'itemType': 'conferencePaper', 'title': 'Optimal Motion Control for Connected and Automated Electric Vehicles at Signal-Free Intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Xiao', 'lastName': 'Pan'}, {'creatorType': 'author', 'firstName': 'Boli', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Simos A.', 'lastName': 'Evangelou'}, {'creatorType': 'author', 'firstName': 'Stelios', 'lastName': 'Timotheou'}], 'abstractNote': 'Traffic congestion is one of the major issues for urban traffic networks. The connected and autonomous vehicles (CAV) is an emerging technology that has the potential to address this issue by improving safety, efficiency, and capacity of the transportation system. In this paper, the problem of optimal trajectory planning of battery-electric CAVs in the context of cooperative crossing of an unsignalized intersection is addressed. An optimization-based centralized intersection controller is proposed to find the optimal velocity trajectory of each vehicle so as to minimize electric energy consumption and traffic throughput. Solving the underlying optimization problem for a group of CAVs is not straightforward because of the nonlinear and nonconvex dynamics, especially when the powertrain model is explicitly modelled. In order to ensure a rapid solution search and a unique global optimum, the optimal control problem (OCP) is reformulated via convex modeling techniques. Several simulation case studies show the effectiveness of the proposed approach and the trade-off between energy consumption and traffic throughput is illustrated.', 'date': '2020-12', 'proceedingsTitle': '2020 59th IEEE Conference on Decision and Control (CDC)', 'conferenceName': '2020 59th IEEE Conference on Decision and Control (CDC)', 'place': '', 'publisher': '', 'volume': '', 'pages': '2831-2836', 'series': '', 'language': '', 'DOI': '10.1109/CDC42340.2020.9304392', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/9304392', 'accessDate': '2024-07-05T02:46:56Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2576-2370', 'tags': [{'tag': 'Batteries', 'type': 1}, {'tag': 'Energy consumption', 'type': 1}, {'tag': 'Mechanical power transmission', 'type': 1}, {'tag': 'Safety', 'type': 1}, {'tag': 'Torque', 'type': 1}, {'tag': 'Trajectory', 'type': 1}, {'tag': 'Vehicle dynamics', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T02:46:56Z', 'dateModified': '2025-01-19T07:17:30Z'}\n",
      "{'key': 'ADZ55MFZ', 'version': 3984, 'itemType': 'journalArticle', 'title': 'Optimal Cooperative Driving at Signal-Free Intersections With Polynomial-Time Complexity', 'creators': [{'creatorType': 'author', 'firstName': 'Huaxin', 'lastName': 'Pei'}, {'creatorType': 'author', 'firstName': 'Yuxiao', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Shuo', 'lastName': 'Feng'}], 'abstractNote': 'Cooperative driving at signal-free intersections, which aims to improve driving safety and efficiency for connected and automated vehicles, has attracted increasing interest in recent years. However, existing cooperative driving strategies either suffer from computational complexity or cannot guarantee global optimality. To fill this research gap, this paper proposes an optimal and computationally efficient cooperative driving strategy with the polynomial-time complexity. By modeling the conflict relations among the vehicles, the solution space of the cooperative driving problem is completely represented by a newly designed small-size state space. Then, based on dynamic programming, the globally optimal solution can be searched inside the state space efficiently. It is proved that the proposed strategy can reduce the time complexity of computation from exponential to a small-degree polynomial. Simulation results further demonstrate that the proposed strategy can obtain the globally optimal solution within a limited computation time under various traffic demand settings.', 'publicationTitle': 'IEEE Transactions on Intelligent Transportation Systems', 'volume': '23', 'issue': '8', 'pages': '12908-12920', 'date': '2022-08', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/TITS.2021.3118592', 'ISSN': '1558-0016', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/9569746', 'accessDate': '2024-07-05T01:23:00Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Transactions on Intelligent Transportation Systems', 'tags': [{'tag': 'Connected and automated vehicles', 'type': 1}, {'tag': 'Dynamic programming', 'type': 1}, {'tag': 'Merging', 'type': 1}, {'tag': 'Safety', 'type': 1}, {'tag': 'Simulation', 'type': 1}, {'tag': 'Space vehicles', 'type': 1}, {'tag': 'Time complexity', 'type': 1}, {'tag': 'Vehicle dynamics', 'type': 1}, {'tag': 'cooperative driving', 'type': 1}, {'tag': 'dynamic programming', 'type': 1}, {'tag': 'signal-free intersection', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T01:23:00Z', 'dateModified': '2025-01-19T07:17:30Z'}\n",
      "{'key': '9VUFCTX7', 'version': 3993, 'itemType': 'journalArticle', 'title': 'Modeling vehicle U-turning behavior near intersections: A deep learning approach based on TCN and multi-head attention', 'creators': [{'creatorType': 'author', 'firstName': 'Weiliang', 'lastName': 'Zeng'}, {'creatorType': 'author', 'firstName': 'Qinyong', 'lastName': 'Lin'}, {'creatorType': 'author', 'firstName': 'Boyang', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Chujun', 'lastName': 'Peng'}, {'creatorType': 'author', 'firstName': 'Rong', 'lastName': 'Yu'}], 'abstractNote': 'In U-turn bays near intersections, the conflict between U-turning vehicles and those going straight-ahead results in traffic accidents since straight-ahead vehicles cannot reliably anticipate the behavior of oncoming U-turning vehicles. However, previous studies on modeling U-turning behavior do not effectively capture the spatial–temporal interaction between the U-turning and surrounding vehicles. To address this issue, a deep-learning framework based on a temporal convolutional network (TCN) and multi-head attention mechanism is developed. The TCN is utilized to capture long-term dependencies of vehicles in the shared left- and U-turn lanes by extracting vehicle historical motion features. The self-attention mechanism extracts salient features related to the U-turn intentions, classifying the vehicles into left- and U-turning vehicles based on their driving intentions. A parallel TCN and spatial multi-head attention structure is constructed to model vehicle–vehicle interactions to further predict the future trajectory of U-turning vehicles. Finally, the obtained features are input into a Transformer-based decoder module and trajectory generator to predict the future displacement and body orientation of U-turning vehicles. The model is validated via comparison with state-of-the-art models and the observed trajectories under various scenarios. Ablation studies are conducted to quantify the efficacy of each module. Further, the effect of the surrounding homogenous and heterogeneous vehicles on U-turning vehicles in four different U-turn scenarios is quantified using spatial–temporal variation graphs and attention matrices.', 'publicationTitle': 'Expert Systems with Applications', 'volume': '249', 'issue': '', 'pages': '123674', 'date': '2024-09-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Expert Systems with Applications', 'language': '', 'DOI': '10.1016/j.eswa.2024.123674', 'ISSN': '0957-4174', 'shortTitle': 'Modeling vehicle U-turning behavior near intersections', 'url': 'https://www.sciencedirect.com/science/article/pii/S0957417424005402', 'accessDate': '2024-08-09T10:25:28Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Deep learning', 'type': 1}, {'tag': 'Driving intention', 'type': 1}, {'tag': 'Multi-head attention mechanism', 'type': 1}, {'tag': 'Trajectory prediction', 'type': 1}, {'tag': 'Vehicle U-turning behavior', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-08-09T10:25:28Z', 'dateModified': '2025-01-19T07:17:29Z'}\n",
      "{'key': '7XIP98H7', 'version': 3996, 'itemType': 'journalArticle', 'title': 'Methods of photovoltaic fault detection and classification: A review', 'creators': [{'creatorType': 'author', 'firstName': 'Ying-Yi', 'lastName': 'Hong'}, {'creatorType': 'author', 'firstName': 'Rolando A.', 'lastName': 'Pula'}], 'abstractNote': 'Photovoltaic (PV) fault detection and classification are essential in maintaining the reliability of the PV system (PVS). Various faults may occur in either DC or AC side of the PVS. The detection, classification, and localization of such faults are essential for mitigation, accident prevention, reduction of the loss of generated energy, and revenue. In recent years, the number of works of PV fault detection and classification has significantly increased. These works have been reviewed by considering the categorization of detection and classification techniques. This paper improves of the categorization of methods to study the faulty PVS by considering visual and thermal method and electrical based method. Moreover, an effort is made to list all potential faults in a PVS in both the DC and AC sides. Specific PV fault detection and classification techniques are also enumerated. A possible direction for research on the PV fault detection and classification, such as quantum machine learning, internet of things, and cloud/edge computing technologies, is suggested as a guide for future emerging technologies.', 'publicationTitle': 'Energy Reports', 'volume': '8', 'issue': '', 'pages': '5898-5929', 'date': '2022-11-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Energy Reports', 'language': '', 'DOI': '10.1016/j.egyr.2022.04.043', 'ISSN': '2352-4847', 'shortTitle': 'Methods of photovoltaic fault detection and classification', 'url': 'https://www.sciencedirect.com/science/article/pii/S2352484722008022', 'accessDate': '2025-01-18T15:06:54Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Classification', 'type': 1}, {'tag': 'Detection', 'type': 1}, {'tag': 'Electrical based method', 'type': 1}, {'tag': 'Fault', 'type': 1}, {'tag': 'Photovoltaic system', 'type': 1}, {'tag': 'Visual and thermal method', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-01-18T15:06:54Z', 'dateModified': '2025-01-19T07:17:28Z'}\n",
      "{'key': '6H764JJW', 'version': 3982, 'itemType': 'journalArticle', 'title': 'Evaluating the Impacts of Different Exit Strategies of Emergency Vehicle Preemption on Arterial Signal Coordination: A Case Study in Reno, Nevada', 'creators': [{'creatorType': 'author', 'firstName': 'Jianyuan', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Zong', 'lastName': 'Tian'}, {'creatorType': 'author', 'firstName': 'Aobo', 'lastName': 'Wang'}], 'abstractNote': 'AbstractEmergency vehicle preemption (EVP), a common traffic signal preemption in urban areas,\\nis used to prioritize the right-of-way to emergency vehicles at signalized intersections\\nby terminating active signal timing plans and running preemption plans. ...Practical ApplicationsTraffic signal preemption systems play an essential role in emergency response management\\nin terms of shortened response times, improved traffic safety, and potential cost\\nsavings. As one of the common traffic signal preemptions in ...', 'publicationTitle': 'Journal of Transportation Engineering, Part A: Systems', 'volume': '149', 'issue': '11', 'pages': '05023007', 'date': '2023/11/01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'EN', 'DOI': '10.1061/JTEPBS.TEENG-7819', 'ISSN': '', 'shortTitle': 'Evaluating the Impacts of Different Exit Strategies of Emergency Vehicle Preemption on Arterial Signal Coordination', 'url': 'https://ascelibrary.org/doi/10.1061/JTEPBS.TEENG-7819', 'accessDate': '2024-08-29T12:46:58Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ASCE', 'callNumber': '', 'rights': '© 2023 American Society of Civil Engineers', 'extra': 'Publisher: American Society of Civil Engineers', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-08-29T12:46:58Z', 'dateModified': '2025-01-19T07:17:28Z'}\n",
      "{'key': 'H4YVAUQX', 'version': 3982, 'itemType': 'journalArticle', 'title': 'Merging process of U-turns at uncontrolled median openings under mixed traffic conditions', 'creators': [{'creatorType': 'author', 'firstName': 'Gourab', 'lastName': 'Sil'}, {'creatorType': 'author', 'firstName': 'Smruti Sourava', 'lastName': 'Mohapatra'}, {'creatorType': 'author', 'firstName': 'Partha Pratim', 'lastName': 'Dey'}, {'creatorType': 'author', 'firstName': 'Satish', 'lastName': 'Chandra'}], 'abstractNote': 'At an uncontrolled median opening, the limited priority situation and the high degree of heterogeneity in traffic stream make the merging manoeuvre of U-turning vehicles very much complex. This study is an attempt to understand this merging manoeuvre. The different types of merging manoeuvres have been identified in the field and accordingly classified into different categories. Depending upon the number of vehicles that can merge all together into the opposing through traffic by accepting a single gap, the merging has been classified into two types: single entry merging and multiple entry merging. On the other hand, based on the situation of priority of movement, the merging process is divided into another two categories: ideal merging and forced merging. More explicitly, the ideal merging is split into free merging and Swift Merging (SM). In addition, the forced entry merging is categorized into Gradual Merging (GM) and Aggressive Merging (AM). Time distance diagrams for different types of merging are presented for their better understanding. Field data collected at seven median openings located on various 6-lane divided urban roads are used to analyse different types of merging in a mixed traffic situation. All vehicles plying on the road are divided into 5 categories such as car, motorized two-wheeler (2-W), motorized three-wheeler (3-W), Sports Utility Vehicle (SUV), and Light Commercial Vehicle (LCV) and the merging behaviour of these categories of vehicles have been studied. The effect of influencing parameters like opposing traffic volume and delay on merging are investigated. Mathematical relations are developed between Merging Time (MT) of a vehicle type and the opposing traffic volume. To address the effect of Service Delay (SD) on the MT of a vehicle, models are proposed between SD and MT for all the five categories of vehicles. The two types of merging; gradual and swift are prominently observed in field. The time required by different categories of vehicles for these two merging at various traffic volume levels are determined. Finally, two-tailed t-test is conducted to see if the MT for the two different types of merging is statistically different.\\nFirst published online 26 October 2016\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tKeyword : \\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tmerging, \\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tmedian opening, \\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tmixed traffic, \\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tservice delay, \\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\topposing traffic\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\tHow to Cite\\n\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n  Sil, G., Mohapatra, S. S., Dey, P. P., & Chandra, S. (2018). 
Merging process of U-turns at uncontrolled median openings under mixed traffic conditions. Transport, 33(2), 370–379. https://doi.org/10.3846/16484142.2016.1247295\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tMore Citation Formats\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tACM\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tACS\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tAPA\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tABNT\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tChicago\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tHarvard\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tIEEE\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tMLA\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tTurabian\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tVancouver', 'publicationTitle': 'Transport', 'volume': '33', 'issue': '2', 'pages': '370-379', 'date': '2018-01-26', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '10.3846/16484142.2016.1247295', 'ISSN': '1648-3480', 'shortTitle': '', 'url': 'https://journals.vilniustech.lt/index.php/Transport/article/view/193', 'accessDate': '2024-07-17T07:34:20Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'journals.vilniustech.lt', 'callNumber': '', 'rights': 'Copyright (c) 2016 The Author(s). 
Published by Vilnius Gediminas Technical University.', 'extra': 'Number: 2', 'tags': [{'tag': 'median opening', 'type': 1}, {'tag': 'merging', 'type': 1}, {'tag': 'mixed traffic', 'type': 1}, {'tag': 'opposing traffic', 'type': 1}, {'tag': 'service delay', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-17T07:34:20Z', 'dateModified': '2025-01-19T07:17:28Z'}\n",
      "{'key': 'VWIHH3M2', 'version': 3981, 'itemType': 'conferencePaper', 'title': 'Latency-Robust Control of High-Speed Signal-Free Intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Yang', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Zev', 'lastName': 'Nicolai-Scanio'}, {'creatorType': 'author', 'firstName': 'Zhong-Ping', 'lastName': 'Jiang'}, {'creatorType': 'author', 'firstName': 'Li', 'lastName': 'Jin'}], 'abstractNote': \"High-speed signal-free intersections are a novel urban traffic operation enabled by connected and autonomous vehicles. However, the impact of communication latency on intersection performance has not been well understood. In this paper, we consider vehicle coordination at signal-free intersections with latency. We focus on two questions: (i) how to ensure latency-resiliency of the coordination algorithm, and (ii) how latency affects the intersection's capacity. We consider a trajectory-based model with bounded speed uncertainties. Latency leads to uncertain state observation. We propose a piecewise-linear control law that ensures safety (avoidance of interference) as long as the initial condition is safe. We also analytically quantify the throughput that the proposed control can attain in the face of latency.\", 'date': '2021-05', 'proceedingsTitle': '2021 American Control Conference (ACC)', 'conferenceName': '2021 American Control Conference (ACC)', 'place': '', 'publisher': '', 'volume': '', 'pages': '2935-2942', 'series': '', 'language': '', 'DOI': '10.23919/ACC50511.2021.9482689', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/9482689', 'accessDate': '2024-07-05T03:30:28Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 2378-5861', 'tags': [{'tag': 'Autonomous vehicles', 'type': 1}, {'tag': 'Faces', 'type': 1}, {'tag': 'Interference', 'type': 1}, {'tag': 'Safety', 'type': 1}, {'tag': 'Signal-free intersections', 'type': 1}, {'tag': 'Throughput', 'type': 1}, {'tag': 'Uncertainty', 'type': 1}, {'tag': 'connected and autonomous vehicles', 'type': 1}, {'tag': 'robust control', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T03:30:28Z', 'dateModified': '2025-01-19T07:17:28Z'}\n",
      "{'key': '7WJA3MXM', 'version': 3981, 'itemType': 'journalArticle', 'title': 'Intelligent vehicle control at signal-free intersection under mixed connected environment', 'creators': [{'creatorType': 'author', 'firstName': 'Hao', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Ken', 'lastName': 'Oguchi'}], 'abstractNote': '<em>IET Intelligent Transport Systems</em> is an interdisciplinary journal publishing research on the practical applications of intelligent transport systems and infrastructure.', 'publicationTitle': 'IET Intelligent Transport Systems', 'volume': '14', 'issue': '2', 'pages': '82-90', 'date': '2020/02/01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '10.1049/iet-its.2019.0175', 'ISSN': '1751-9578', 'shortTitle': '', 'url': 'https://ietresearch.onlinelibrary.wiley.com/doi/10.1049/iet-its.2019.0175', 'accessDate': '2024-07-05T02:41:50Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ietresearch.onlinelibrary.wiley.com', 'callNumber': '', 'rights': '', 'extra': 'Publisher: John Wiley & Sons, Ltd', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T02:41:50Z', 'dateModified': '2025-01-19T07:17:28Z'}\n",
      "{'key': 'Z43DSDFQ', 'version': 3982, 'itemType': 'journalArticle', 'title': 'Emergency vehicle route oriented signal coordinated control model with two-level programming', 'creators': [{'creatorType': 'author', 'firstName': 'Jiao', 'lastName': 'Yao'}, {'creatorType': 'author', 'firstName': 'Kaimin', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Yuanyuan', 'lastName': 'Yang'}, {'creatorType': 'author', 'firstName': 'Jin', 'lastName': 'Wang'}], 'abstractNote': 'To minimize travel time of emergency vehicles on the way and improve efficiency of emergency response, an emergency vehicle route oriented signal coordinated control model with two-level programming was proposed based on the different priority types and priority levels of emergency vehicles. The upper level is the dynamic offset model of emergency vehicles, and the lower level is the green wave model of emergency vehicles. At dynamic offset level, latter phase was calculated based on the queue length ahead of the emergency vehicles and their arrival time, in which the former phase was the reference object. At route green wave level, maximum bandwidth of the route of emergency vehicles was calculated, based on the turning movement characteristics and its corresponding capacity reduction. Furthermore, the two-level programming model solution is obtained with genetic algorithm. Finally, simulation results of three control strategies, which are no-signal priority control strategy, isolated control priority strategy and coordinated priority control strategy in this paper, were obtained in micro-traffic simulation software VISSIM, with the case including three intersections in Suzhou roads as the emergency vehicles route. From the simulation results we can conclude that compared to no-signal priority control strategy, coordinated priority strategy can reduce delay, travel time, queue length and stops of emergency vehicles by 27,18, 36 and 38%, respectively, and the average delay of total vehicles at intersection can be reduced by 20%; compared to isolated control priority strategy, these numbers are 14, 6, 12, 21 and 22%, respectively, which means great improvement, and influence on social background traffic was also considered in it.', 'publicationTitle': 'Soft Computing', 'volume': '22', 'issue': '13', 'pages': '4283-4294', 'date': '2018-07-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Soft Comput', 'language': 'en', 'DOI': '10.1007/s00500-017-2826-x', 'ISSN': '1433-7479', 'shortTitle': '', 'url': 'https://doi.org/10.1007/s00500-017-2826-x', 'accessDate': '2024-08-28T07:55:42Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'Springer Link', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Artificial Intelligence', 'type': 1}, {'tag': 'Coordinated control', 'type': 1}, {'tag': 'Dynamic offset of phase', 'type': 1}, {'tag': 'Emergency traffic', 'type': 1}, {'tag': 'Genetic algorithm', 'type': 1}, {'tag': 'Green wave', 'type': 1}, {'tag': 'Route', 'type': 1}, {'tag': 'Two-level programming', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-08-28T07:55:42Z', 'dateModified': '2025-01-19T07:17:27Z'}\n",
      "{'key': 'T3ZPBLFF', 'version': 3981, 'itemType': 'journalArticle', 'title': 'Distributed conflict-free cooperation for multiple connected vehicles at unsignalized intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Biao', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Shengbo Eben', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Yougang', 'lastName': 'Bian'}, {'creatorType': 'author', 'firstName': 'Shen', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Xuegang Jeff', 'lastName': 'Ban'}, {'creatorType': 'author', 'firstName': 'Jianqiang', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Keqiang', 'lastName': 'Li'}], 'abstractNote': 'Connected vehicles will change the modes of future transportation management and organization, especially at intersections. In this paper, we propose a distributed conflict-free cooperation method for multiple connected vehicles at unsignalized intersections. We firstly project the approaching vehicles from different traffic movements into a virtual lane and introduce a conflict-free geometry topology considering the conflict relationship of involved vehicles, thus constructing a virtual platoon. Then we present the modeling of communication topology to describe two modes of information transmission between vehicles. Finally, a distributed controller is designed to stabilize the virtual platoon for conflict-free cooperation at intersections. Numerical simulations validate the effectiveness of this method.', 'publicationTitle': 'Transportation Research Part C: Emerging Technologies', 'volume': '93', 'issue': '', 'pages': '322-334', 'date': '2018-08-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Transportation Research Part C: Emerging Technologies', 'language': '', 'DOI': '10.1016/j.trc.2018.06.004', 'ISSN': '0968-090X', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X18308246', 'accessDate': '2024-07-05T04:27:33Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Connected and automated vehicle', 'type': 1}, {'tag': 'Cooperative control', 'type': 1}, {'tag': 'Unsignalized intersection', 'type': 1}, {'tag': 'Virtual platoon', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T04:27:33Z', 'dateModified': '2025-01-19T07:17:27Z'}\n",
      "{'key': 'V4P85YIY', 'version': 3983, 'itemType': 'conferencePaper', 'title': 'Development of Autonomous Drones for Adaptive Obstacle Avoidance in Real World Environments', 'creators': [{'creatorType': 'author', 'firstName': 'Arne', 'lastName': 'Devos'}, {'creatorType': 'author', 'firstName': 'Emad', 'lastName': 'Ebeid'}, {'creatorType': 'author', 'firstName': 'Poramate', 'lastName': 'Manoonpong'}], 'abstractNote': 'Recently, drones have been involved in several critical tasks such as infrastructure inspection, crisis response, and search and rescue operations. Such drones mostly use sophisticated computer vision techniques to effectively avoid obstacles and, thereby, require high computational power. Therefore, this work tuned and tested a computationally inexpensive algorithm, previously developed by the authors, for adaptive obstacle avoidance control of a drone. The algorithm aims at protecting the drone from entering in complex situations such as deadlocks and corners. The algorithm has been validated through simulation and implemented on a newly developed drone platform for infrastructure inspection. The design of the drone platform and the experimental results are presented in this study.', 'date': '2018-08', 'proceedingsTitle': '2018 21st Euromicro Conference on Digital System Design (DSD)', 'conferenceName': '2018 21st Euromicro Conference on Digital System Design (DSD)', 'place': '', 'publisher': '', 'volume': '', 'pages': '707-710', 'series': '', 'language': '', 'DOI': '10.1109/DSD.2018.00009', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/8491889', 'accessDate': '2024-09-29T06:43:55Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Adaptive obstacle avoidance', 'type': 1}, {'tag': 'Autonomous drone system', 'type': 1}, {'tag': 'Collision avoidance', 'type': 1}, {'tag': 'Drones', 'type': 1}, {'tag': 'Implementation', 'type': 1}, {'tag': 'Laser radar', 'type': 1}, {'tag': 'Navigation', 'type': 1}, {'tag': 'Propellers', 'type': 1}, {'tag': 'Signal processing algorithms', 'type': 1}, {'tag': 'Simulation', 'type': 1}, {'tag': 'System recovery', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-09-29T06:43:55Z', 'dateModified': '2025-01-19T07:17:26Z'}\n",
      "{'key': 'V9VXJJF7', 'version': 3983, 'itemType': 'journalArticle', 'title': 'Decision making of autonomous vehicles in lane change scenarios: Deep reinforcement learning approaches with risk awareness', 'creators': [], 'abstractNote': 'Driving safety is the most important element that needs to be considered for autonomous vehicles (AVs). To ensure driving safety, we proposed a lane c…', 'publicationTitle': 'Transportation Research Part C: Emerging Technologies', 'volume': '134', 'issue': '', 'pages': '103452', 'date': '2022/01/01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en-US', 'DOI': '10.1016/j.trc.2021.103452', 'ISSN': '0968-090X', 'shortTitle': 'Decision making of autonomous vehicles in lane change scenarios', 'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X21004411', 'accessDate': '2024-10-27T11:32:40Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'www.sciencedirect.com', 'callNumber': '', 'rights': '', 'extra': 'Publisher: Pergamon', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-10-27T11:32:40Z', 'dateModified': '2025-01-19T07:17:25Z'}\n",
      "{'key': 'G2TFG8XP', 'version': 3983, 'itemType': 'conferencePaper', 'title': 'Deep Reinforcement Learning for Persistent Cruise Control in UAV-aided Data Collection', 'creators': [{'creatorType': 'author', 'firstName': 'Harrison', 'lastName': 'Kurunathan'}, {'creatorType': 'author', 'firstName': 'Kai', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Wei', 'lastName': 'Ni'}, {'creatorType': 'author', 'firstName': 'Eduardo', 'lastName': 'Tovar'}, {'creatorType': 'author', 'firstName': 'Falko', 'lastName': 'Dressler'}], 'abstractNote': 'Autonomous UAV cruising is gaining attention due to its flexible deployment in remote sensing, surveillance, and reconnaissance. A critical challenge in data collection with the autonomous UAV is the buffer overflows at the ground sensors and packet loss due to lossy airborne channels. Trajectory planning of the UAV is vital to alleviate buffer overflows as well as channel fading. In this work, we propose a Deep Deterministic Policy Gradient based Cruise Control (DDPG-CC) to reduce the overall packet loss through online training of headings and cruise velocity of the UAV, as well as the selection of the ground sensors for data collection. Preliminary performance evaluation demonstrates that DDPG-CC reduces the packet loss rate by under 5% when sufficient training is provided to the UAV.', 'date': '2021-10', 'proceedingsTitle': '2021 IEEE 46th Conference on Local Computer Networks (LCN)', 'conferenceName': '2021 IEEE 46th Conference on Local Computer Networks (LCN)', 'place': '', 'publisher': '', 'volume': '', 'pages': '347-350', 'series': '', 'language': '', 'DOI': '10.1109/LCN52139.2021.9525022', 'ISBN': '', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/document/9525022', 'accessDate': '2024-09-29T06:43:12Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'ISSN: 0742-1303', 'tags': [{'tag': 'Autonomous UAV', 'type': 1}, {'tag': 'Buffer overflows', 'type': 1}, {'tag': 'Cruise control', 'type': 1}, {'tag': 'Data collection', 'type': 1}, {'tag': 'Deep reinforcement learning', 'type': 1}, {'tag': 'Packet loss', 'type': 1}, {'tag': 'Reinforcement learning', 'type': 1}, {'tag': 'Training', 'type': 1}, {'tag': 'Trajectory planning', 'type': 1}, {'tag': 'UAV-aided WSN', 'type': 1}, {'tag': 'Wireless sensor networks', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-09-29T06:43:12Z', 'dateModified': '2025-01-19T07:17:25Z'}\n",
      "{'key': '6DWSNBHJ', 'version': 3981, 'itemType': 'journalArticle', 'title': 'Development of a signal-head-free intersection control logic in a fully connected and autonomous vehicle environment', 'creators': [{'creatorType': 'author', 'firstName': 'Amir', 'lastName': 'Mirheli'}, {'creatorType': 'author', 'firstName': 'Leila', 'lastName': 'Hajibabai'}, {'creatorType': 'author', 'firstName': 'Ali', 'lastName': 'Hajbabaie'}], 'abstractNote': 'Establishment of effective cooperation between vehicles and transportation infrastructure improves travel reliability in urban transportation networks. Lack of collaboration, however, exacerbates congestion due mainly to frequent stops at signalized intersections. It is beneficial to develop a control logic that collects basic safety message from approaching connected and autonomous vehicles and guarantees efficient intersection operations with safe and incident free vehicle maneuvers. In this paper, a signal-head-free intersection control logic is formulated into a dynamic programming model that aims to maximize the intersection throughput. A stochastic look-ahead technique is proposed based on Monte Carlo tree search algorithm to determine the near-optimal actions (i.e., acceleration rates) over time to prevent movement conflicts. Our numerical results confirm that the proposed technique can solve the problem efficiently and addresses the consequences of existing traffic signals. The proposed approach, while completely avoids incidents at intersections, significantly reduces travel time (ranging between 59.4% and 83.7% when compared to fixed-time and fully-actuated control strategies) at intersections under various demand patterns.', 'publicationTitle': 'Transportation Research Part C: Emerging Technologies', 'volume': '92', 'issue': '', 'pages': '412-425', 'date': '2018-07-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Transportation Research Part C: Emerging Technologies', 'language': '', 'DOI': '10.1016/j.trc.2018.04.026', 'ISSN': '0968-090X', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X1830576X', 'accessDate': '2024-07-05T04:21:04Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Autonomous intersection control', 'type': 1}, {'tag': 'Connected and autonomous vehicles', 'type': 1}, {'tag': 'Control logic', 'type': 1}, {'tag': 'Dynamic programming', 'type': 1}, {'tag': 'Look-ahead model', 'type': 1}, {'tag': 'Monte Carlo tree search', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T04:21:04Z', 'dateModified': '2025-01-19T07:17:25Z'}\n",
      "{'key': 'PDJHYP8L', 'version': 3980, 'itemType': 'journalArticle', 'title': 'Decentralized optimal control of Connected Automated Vehicles at signal-free intersections including comfort-constrained turns and safety guarantees', 'creators': [{'creatorType': 'author', 'firstName': 'Yue', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Christos G.', 'lastName': 'Cassandras'}], 'abstractNote': 'We extend earlier work for optimally controlling Connected Automated Vehicles (CAVs) crossing a signal-free intersection by including all possible turns taken so as to optimize a passenger comfort metric along with energy and travel time minimization. We show that it is possible to achieve this goal in a decentralized manner with each CAV solving an optimal control problem, and derive explicit solutions that guarantee collision avoidance and safe distance constraints within a control zone. We investigate the associated tradeoffs between minimizing energy and vehicle travel time, as well as the passenger comfort metric and include extensive simulations to illustrate this framework.', 'publicationTitle': 'Automatica', 'volume': '109', 'issue': '', 'pages': '108563', 'date': '2019-11-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Automatica', 'language': '', 'DOI': '10.1016/j.automatica.2019.108563', 'ISSN': '0005-1098', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0005109819304248', 'accessDate': '2024-07-04T16:09:57Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-04T16:09:57Z', 'dateModified': '2025-01-19T07:17:25Z'}\n",
      "{'key': 'HURB3WYZ', 'version': 3983, 'itemType': 'preprint', 'title': 'Coordinated Lane-Changing Scheduling of Multilane Cav Platoons in Heterogeneous Scenarios', 'creators': [{'creatorType': 'author', 'firstName': 'Qingquan', 'lastName': 'Liu'}, {'creatorType': 'author', 'firstName': 'Xi', 'lastName': 'Lin'}, {'creatorType': 'author', 'firstName': 'Meng', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Li', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Fang', 'lastName': 'He'}], 'abstractNote': 'With the development of sensing, communication and automated driving technology, connected and automated vehicles (CAVs) are becoming promising solutions for future transport requirements. It is widely believed that a vehicle platoon is a good form to organize urban traffic in the CAV era. Due to the multicommodity nature of urban traffic streams, vehicles will continuously leave and join a multilane platoon, which inevitably gives rise to the need of lane changing within a multilane platoon. This paper studies the coordinated lane-changing scheduling problem in a CAV platoon, with the goal of transferring the platoon from an initial state to a target state to minimize a certain cost measurement (e.g., number of steps), while heterogeneous scenarios are considered. Two approaches, i.e., an exact and an approximate approach, are proposed. For the exact approach, we formulate an integer linear programming (ILP) model to identify the global optimal solution. Multiple objective functions are defined to meet the different needs. To relieve the computational issue of the exact approach, we further propose a tree-based heuristic search (THS), an approximate algorithm framework. THS is able to obtain an acceptable solution with negligible computational effort, and has the potential to handle the scheduling problem with more precise modeling or larger platoons. Numerical experiments are conducted to demonstrate the performance of different algorithms on both smalland large-scale cases (with up to 60 vehicles in a platoon), and the parameter combinations in the THS are tested for the optimal trade-off between solution quality and computational load. The findings indicate that ILP is practical for small- or medium-scale cases, which can generate multiple optimal solutions for different objectives; THS can solve large-scale cases in milliseconds on an ordinary personal computer, while the acquired solution is verified to be only slightly worse than the global optimum.', 'genre': 'SSRN Scholarly Paper', 'repository': 'Social Science Research Network', 'archiveID': '4154966', 'place': 'Rochester, NY', 'date': '2022-07-11', 'series': '', 'seriesNumber': '', 'DOI': '10.2139/ssrn.4154966', 'citationKey': '', 'url': 'https://papers.ssrn.com/abstract=4154966', 'accessDate': '2024-10-23T12:52:17Z', 'archive': '', 'archiveLocation': '', 'shortTitle': '', 'language': 'en', 'libraryCatalog': 'papers.ssrn.com', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'connected and automated vehicle', 'type': 1}, {'tag': 'heuristic algorithm', 'type': 1}, {'tag': 'integer linear programming', 'type': 1}, {'tag': 'lane changing', 'type': 1}, {'tag': 'multilane platoon', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-10-23T12:52:17Z', 'dateModified': '2025-01-19T07:17:24Z'}\n",
      "{'key': 'W2ECY8ER', 'version': 3981, 'itemType': 'journalArticle', 'title': 'Controllability Analysis and Optimal Control of Mixed Traffic Flow With Human-Driven and Autonomous Vehicles', 'creators': [{'creatorType': 'author', 'firstName': 'Jiawei', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Yang', 'lastName': 'Zheng'}, {'creatorType': 'author', 'firstName': 'Qing', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Jianqiang', 'lastName': 'Wang'}, {'creatorType': 'author', 'firstName': 'Keqiang', 'lastName': 'Li'}], 'abstractNote': 'Connected and automated vehicles (CAVs) have a great potential to improve traffic efficiency in mixed traffic systems, which has been demonstrated by multiple numerical simulations and field experiments. However, some fundamental properties of mixed traffic flow, including controllability and stabilizability, have not been well understood. This paper analyzes the controllability of mixed traffic systems and designs a system-level optimal control strategy. Using the Popov-Belevitch-Hautus (PBH) criterion, we prove for the first time that a ring-road mixed traffic system with one CAV and multiple heterogeneous human-driven vehicles is not completely controllable, but is stabilizable under a very mild condition. Then, we formulate the design of a system-level control strategy for the CAV as a structured optimal control problem, where the CAV’s communication ability is explicitly considered. Finally, we derive an upper bound for reachable traffic velocity via controlling the CAV. Extensive numerical experiments verify the effectiveness of our analytical results and the proposed control strategy. Our results validate the possibility of utilizing CAVs as mobile actuators to smooth traffic flow actively.', 'publicationTitle': 'IEEE Transactions on Intelligent Transportation Systems', 'volume': '22', 'issue': '12', 'pages': '7445-7459', 'date': '2021-12', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/TITS.2020.3002965', 'ISSN': '1558-0016', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/9127876', 'accessDate': '2024-07-05T03:38:23Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Transactions on Intelligent Transportation Systems', 'tags': [{'tag': 'Autonomous vehicle', 'type': 1}, {'tag': 'Autonomous vehicles', 'type': 1}, {'tag': 'Controllability', 'type': 1}, {'tag': 'Optimal control', 'type': 1}, {'tag': 'Road traffic', 'type': 1}, {'tag': 'Stability analysis', 'type': 1}, {'tag': 'Vehicle dynamics', 'type': 1}, {'tag': 'controllability and stabilizability', 'type': 1}, {'tag': 'mixed traffic flow', 'type': 1}, {'tag': 'structured optimal control', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T03:38:23Z', 'dateModified': '2025-01-19T07:17:24Z'}\n",
      "{'key': '3J2M68LE', 'version': 3980, 'itemType': 'journalArticle', 'title': 'COOR-PLT: A hierarchical control model for coordinating adaptive platoons of connected and autonomous vehicles at signal-free intersections based on deep reinforcement learning', 'creators': [{'creatorType': 'author', 'firstName': 'Duowei', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Feng', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Tianyi', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Yiik Diew', 'lastName': 'Wong'}, {'creatorType': 'author', 'firstName': 'Chunli', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Jianping', 'lastName': 'Wu'}], 'abstractNote': 'Platooning and coordination are two implementation strategies that are frequently proposed for traffic control of connected and autonomous vehicles (CAVs) at signal-free intersections instead of using conventional traffic signals. However, few studies have attempted to integrate both strategies to better facilitate the CAV control at signal-free intersections. To this end, this study proposes a hierarchical control model, named COOR-PLT, to coordinate adaptive CAV platoons at a signal-free intersection based on deep reinforcement learning (DRL). COOR-PLT has a two-layer framework. The first layer uses a centralized control strategy to form adaptive platoons. The optimal size of each platoon is determined by considering multiple objectives (i.e., efficiency, fairness and energy saving). The second layer employs a decentralized control strategy to coordinate multiple platoons passing through the intersection. Each platoon is labeled with coordinated status or independent status, upon which its passing priority is determined. As an efficient DRL algorithm, Deep Q-network (DQN) is adopted to determine platoon sizes and passing priorities respectively in the two layers. The model is validated and examined on the simulator Simulation of Urban Mobility (SUMO). The simulation results demonstrate that the model is able to: (1) achieve satisfactory convergence performances; (2) adaptively determine platoon size in response to varying traffic conditions; and (3) completely avoid deadlocks at the intersection. By comparison with other control methods, the model manifests its superiority of adopting adaptive platooning and DRL-based coordination strategies. 
Also, the model outperforms several state-of-the-art methods on reducing travel time and fuel consumption in different traffic conditions.', 'publicationTitle': 'Transportation Research Part C: Emerging Technologies', 'volume': '146', 'issue': '', 'pages': '103933', 'date': '2023-01-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Transportation Research Part C: Emerging Technologies', 'language': '', 'DOI': '10.1016/j.trc.2022.103933', 'ISSN': '0968-090X', 'shortTitle': 'COOR-PLT', 'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X22003461', 'accessDate': '2024-07-04T14:07:10Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Adaptive platoon', 'type': 1}, {'tag': 'Connected and autonomous vehicle (CAV)', 'type': 1}, {'tag': 'Deep reinforcement learning', 'type': 1}, {'tag': 'Hierarchical control', 'type': 1}, {'tag': 'Multi-agent coordination', 'type': 1}, {'tag': 'Signal-free intersection', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-04T14:07:10Z', 'dateModified': '2025-01-19T07:17:24Z'}\n",
      "{'key': 'RTIC8WUF', 'version': 3980, 'itemType': 'journalArticle', 'title': 'Comparison of Cooperative Driving Strategies for CAVs at Signal-Free Intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Huile', 'lastName': 'Xu'}, {'creatorType': 'author', 'firstName': 'Christos G.', 'lastName': 'Cassandras'}, {'creatorType': 'author', 'firstName': 'Li', 'lastName': 'Li'}, {'creatorType': 'author', 'firstName': 'Yi', 'lastName': 'Zhang'}], 'abstractNote': 'The properties of cooperative driving strategies for planning and controlling Connected and Automated Vehicles (CAVs) at intersections range from some that achieve highly efficient coordination performance to others whose implementation is computationally fast. This paper comprehensively compares the performance of four representative strategies in terms of travel time, energy consumption, computation time, and fairness under different conditions, including the geometric configuration of intersections, asymmetry in traffic arrival rates, and the relative magnitude of these rates. Our simulation-based study has led to the following conclusions: 1) The Monte Carlo Tree Search (MCTS)-based strategy achieves the best traffic efficiency and has great performance in fuel consumption; 2) MCTS and Dynamic Resequencing (DR) strategies both perform well in all metrics of interest. If the computation budget is adequate, the MCTS strategy is recommended; otherwise, the DR strategy is preferable; 3) An asymmetric intersection has a noticeable impact on the strategies, whereas the influence of the arrival rates can be neglected. When the geometric shape is asymmetrical, the modified First-In-First-Out (FIFO) strategy significantly outperforms the FIFO strategy and works well when the traffic demand is moderate, but their performances are similar in other situations; and 4) Improving traffic efficiency sometimes comes at the cost of fairness, but the DR and MCTS strategies can be adjusted to realize a better trade-off between various performance metrics by appropriately designing their objective functions.', 'publicationTitle': 'IEEE Transactions on Intelligent Transportation Systems', 'volume': '23', 'issue': '7', 'pages': '7614-7627', 'date': '2022-07', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/TITS.2021.3071456', 'ISSN': '1558-0016', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/9406435', 'accessDate': '2024-07-04T13:54:21Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Transactions on Intelligent Transportation Systems', 'tags': [{'tag': 'Automation', 'type': 1}, {'tag': 'Connected and automated vehicles (CAVs)', 'type': 1}, {'tag': 'Measurement', 'type': 1}, {'tag': 'Optimal control', 'type': 1}, {'tag': 'Real-time systems', 'type': 1}, {'tag': 'Safety', 'type': 1}, {'tag': 'Systems engineering and theory', 'type': 1}, {'tag': 'Vehicle dynamics', 'type': 1}, {'tag': 'cooperative driving strategy', 'type': 1}, {'tag': 'crossing sequence', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-04T13:54:21Z', 'dateModified': '2025-01-19T07:17:24Z'}\n",
      "{'key': 'V8KYW4IH', 'version': 3983, 'itemType': 'journalArticle', 'title': 'Adaptive Multi-Scale Fusion Blind Deblurred Generative Adversarial Network Method for Sharpening Image Data', 'creators': [{'creatorType': 'author', 'firstName': 'Baoyu', 'lastName': 'Zhu'}, {'creatorType': 'author', 'firstName': 'Qunbo', 'lastName': 'Lv'}, {'creatorType': 'author', 'firstName': 'Zheng', 'lastName': 'Tan'}], 'abstractNote': 'Drone and aerial remote sensing images are widely used, but their imaging environment is complex and prone to image blurring. Existing CNN deblurring algorithms usually use multi-scale fusion to extract features in order to make full use of aerial remote sensing blurred image information, but images with different degrees of blurring use the same weights, leading to increasing errors in the feature fusion process layer by layer. Based on the physical properties of image blurring, this paper proposes an adaptive multi-scale fusion blind deblurred generative adversarial network (AMD-GAN), which innovatively applies the degree of image blurring to guide the adjustment of the weights of multi-scale fusion, effectively suppressing the errors in the multi-scale fusion process and enhancing the interpretability of the feature layer. The research work in this paper reveals the necessity and effectiveness of a priori information on image blurring levels in image deblurring tasks. By studying and exploring the image blurring levels, the network model focuses more on the basic physical features of image blurring. Meanwhile, this paper proposes an image blurring degree description model, which can effectively represent the blurring degree of aerial remote sensing images. The comparison experiments show that the algorithm in this paper can effectively recover images with different degrees of blur, obtain high-quality images with clear texture details, outperform the comparison algorithm in both qualitative and quantitative evaluation, and can effectively improve the object detection performance of blurred aerial remote sensing images. Moreover, the average PSNR of this paper’s algorithm tested on the publicly available dataset RealBlur-R reached 41.02 dB, surpassing the latest SOTA algorithm.', 'publicationTitle': 'Drones', 'volume': '7', 'issue': '2', 'pages': '96', 'date': '2023/2', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '10.3390/drones7020096', 'ISSN': '2504-446X', 'shortTitle': '', 'url': 'https://www.mdpi.com/2504-446X/7/2/96', 'accessDate': '2024-09-29T06:42:35Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'www.mdpi.com', 'callNumber': '', 'rights': 'http://creativecommons.org/licenses/by/3.0/', 'extra': 'Number: 2\\nPublisher: Multidisciplinary Digital Publishing Institute', 'tags': [{'tag': 'deep learning', 'type': 1}, {'tag': 'drone and aerial remote sensing', 'type': 1}, {'tag': 'generative adversarial networks', 'type': 1}, {'tag': 'image blur level', 'type': 1}, {'tag': 'image deblurring', 'type': 1}, {'tag': 'multi-scale', 'type': 1}, {'tag': 'object detection', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-09-29T06:42:35Z', 'dateModified': '2025-01-19T07:17:23Z'}\n",
      "{'key': 'S2PW4DZK', 'version': 3982, 'itemType': 'journalArticle', 'title': 'A Survey on Emergency Vehicle Preemption Methods Based on Routing and Scheduling', 'creators': [{'creatorType': 'author', 'firstName': 'Shridevi Jeevan', 'lastName': 'Kamble'}, {'creatorType': 'author', 'firstName': 'Manjunath R', 'lastName': 'Kounte'}], 'abstractNote': 'Emergency Vehicles (EVs) play a significant role in saving human lives and property damages. Reducing the time delay of emergency vehicles is important to enhance emergency service performance. The preemption methods are powerful strategies that assist emergency vehicles to reach the desired destination quickly by managing the competing normal traffic along the emergency vehicle approaching lane. The EV preemption models pre-clears the vehicles on the EV approaching lane by interrupting the signal timings and boosting EV arrival speed even the road traffic is high. With the assistance of preemption models, the EVs are not stopping or waiting at signalized intersections. Also, the preemption models diminish the vehicle conflict problems on the EV approaching lane. Moreover, the preemption models use different strategies to navigate the EVs on their routes efficiently. Hence, a detailed survey is needed to understand the different preemption strategies and analyze the gaps which are not effectively solved by existing literature. This paper attempts to survey the existing EV preemption methods with detailed discussions. For a clear view, the survey divides the existing preemption models into three types that are routing-based, scheduling-based, and miscellaneous. The survey compares the preemption methods with their advantages and limitations. Further, it analyzes the gaps which are not solved in existing solutions and describe the possible future directions that pave the way for innovating novel realistic preemption solutions.', 'publicationTitle': 'International Journal of Computer Networks and Applications', 'volume': '9', 'issue': '1', 'pages': '60', 'date': '2022-02-28', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'IJCNA', 'language': 'en', 'DOI': '10.22247/ijcna/2022/211623', 'ISSN': '2395-0455', 'shortTitle': '', 'url': 'http://www.i-scholar.in/index.php/IJCNA/article/view/211623', 'accessDate': '2024-08-25T04:57:46Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'DOI.org (Crossref)', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-08-25T04:57:46Z', 'dateModified': '2025-01-19T07:17:23Z'}\n",
      "{'key': '6VKRI58Z', 'version': 3981, 'itemType': 'journalArticle', 'title': 'A Survey of Monte Carlo Tree Search Methods', 'creators': [{'creatorType': 'author', 'firstName': 'Cameron B.', 'lastName': 'Browne'}, {'creatorType': 'author', 'firstName': 'Edward', 'lastName': 'Powley'}, {'creatorType': 'author', 'firstName': 'Daniel', 'lastName': 'Whitehouse'}, {'creatorType': 'author', 'firstName': 'Simon M.', 'lastName': 'Lucas'}, {'creatorType': 'author', 'firstName': 'Peter I.', 'lastName': 'Cowling'}, {'creatorType': 'author', 'firstName': 'Philipp', 'lastName': 'Rohlfshagen'}, {'creatorType': 'author', 'firstName': 'Stephen', 'lastName': 'Tavener'}, {'creatorType': 'author', 'firstName': 'Diego', 'lastName': 'Perez'}, {'creatorType': 'author', 'firstName': 'Spyridon', 'lastName': 'Samothrakis'}, {'creatorType': 'author', 'firstName': 'Simon', 'lastName': 'Colton'}], 'abstractNote': \"Monte Carlo tree search (MCTS) is a recently proposed search method that combines the precision of tree search with the generality of random sampling. It has received considerable interest due to its spectacular success in the difficult problem of computer Go, but has also proved beneficial in a range of other domains. This paper is a survey of the literature to date, intended to provide a snapshot of the state of the art after the first five years of MCTS research. We outline the core algorithm's derivation, impart some structure on the many variations and enhancements that have been proposed, and summarize the results from the key game and nongame domains to which MCTS methods have been applied. A number of open research questions indicate that the field is ripe for future work.\", 'publicationTitle': 'IEEE Transactions on Computational Intelligence and AI in Games', 'volume': '4', 'issue': '1', 'pages': '1-43', 'date': '2012-03', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/TCIAIG.2012.2186810', 'ISSN': '1943-0698', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/6145622', 'accessDate': '2024-07-06T03:33:53Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Transactions on Computational Intelligence and AI in Games', 'tags': [{'tag': 'Artificial intelligence', 'type': 1}, {'tag': 'Artificial intelligence (AI)', 'type': 1}, {'tag': 'Computers', 'type': 1}, {'tag': 'Decision theory', 'type': 1}, {'tag': 'Game theory', 'type': 1}, {'tag': 'Games', 'type': 1}, {'tag': 'Markov processes', 'type': 1}, {'tag': 'Monte Carlo methods', 'type': 1}, {'tag': 'Monte Carlo tree search (MCTS)', 'type': 1}, {'tag': 'bandit-based methods', 'type': 1}, {'tag': 'computer Go', 'type': 1}, {'tag': 'game search', 'type': 1}, {'tag': 'upper confidence bounds (UCB)', 'type': 1}, {'tag': 'upper confidence bounds for trees (UCT)', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-06T03:33:53Z', 'dateModified': '2025-01-19T07:17:23Z'}\n",
      "{'key': '8U88UBBM', 'version': 3995, 'itemType': 'journalArticle', 'title': 'A lightweight network for photovoltaic cell defect detection in electroluminescence images based on neural architecture search and knowledge distillation', 'creators': [{'creatorType': 'author', 'firstName': 'Jinxia', 'lastName': 'Zhang'}, {'creatorType': 'author', 'firstName': 'Xinyi', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Haikun', 'lastName': 'Wei'}, {'creatorType': 'author', 'firstName': 'Kanjian', 'lastName': 'Zhang'}], 'abstractNote': 'Nowadays, the rapid development of photovoltaic(PV) power stations requires increasingly reliable maintenance and fault diagnosis of PV modules in the field. Due to the effectiveness, convolutional neural network (CNN) has been widely used in the existing automatic defect detection of PV cells. However, the parameters of these CNN-based models are very large, which require stringent hardware resources and it is difficult to be applied in actual industrial projects. To solve these problems, we propose a novel lightweight high-performance model for automatic defect detection of PV cells in electroluminescence(EL) images based on neural architecture search and knowledge distillation. To auto-design an effective lightweight model, we introduce neural architecture search to the field of PV cell defect classification for the first time. Since the defect can be any size, we design a proper search structure of network to better exploit the multi-scale characteristic. To improve the overall performance of the searched lightweight model, we further transfer the knowledge learned by the existing pre-trained large-scale model based on knowledge distillation. Different kinds of knowledge are exploited and transferred, including attention information, feature information, logit information and task-oriented information. Experiments have demonstrated that the proposed model achieves the state-of-the-art performance on the public PV cell dataset of EL images under online data augmentation with accuracy of 91.74% and the parameters of 1.85M. The proposed lightweight high-performance model can be easily deployed to the end devices of the actual industrial projects and retain the accuracy.', 'publicationTitle': 'Applied Energy', 'volume': '355', 'issue': '', 'pages': '122184', 'date': '2024-02-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Applied Energy', 'language': '', 'DOI': '10.1016/j.apenergy.2023.122184', 'ISSN': '0306-2619', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0306261923015489', 'accessDate': '2025-01-18T15:08:54Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Deep learning', 'type': 1}, {'tag': 'Defect detection', 'type': 1}, {'tag': 'Electroluminescence', 'type': 1}, {'tag': 'Knowledge distillation', 'type': 1}, {'tag': 'Neural architecture search', 'type': 1}, {'tag': 'Photovoltaic cells', 'type': 1}], 'collections': ['K5CDH5FQ'], 'relations': {}, 'dateAdded': '2025-01-18T15:08:54Z', 'dateModified': '2025-01-19T07:17:22Z'}\n",
      "{'key': '7GSG5PNG', 'version': 3982, 'itemType': 'journalArticle', 'title': 'A novel intelligent traffic recovery model for emergency vehicles based on context-aware reinforcement learning', 'creators': [{'creatorType': 'author', 'firstName': 'Farzad', 'lastName': 'Kiani'}, {'creatorType': 'author', 'firstName': 'Ömer Faruk', 'lastName': 'Saraç'}], 'abstractNote': 'Management of traffic emergencies has become very popular in recent years. However, timely response to emergencies and recovering from an emergency is an important problem in itself. The strategies in the current studies merely suggest that after an emergency vehicle passes, the state should iterate to the next phase. Therefore, this paper proposes a novel approach for recovering from an emergency situation at an intersection based on real scenarios. The proposed method is a combination of context-aware and Reinforcement Learning (RL) models that predicts better alternatives for different states rather than just iterating to the next phase. In this regard, a new algorithm, named Interrupt Algorithm, is proposed to predict proper actions for recovering the emergency situation. This algorithm uses a Q-learning-based model that learns from traffic context for an emergency situation and chooses viable action from an action set. The recovery actions are categorized as max, min, and avg, respectively. Test results show that our proposed model outperforms traffic flow over than standard single choice recovering action-based approach by approximately 80%. Based on this, it may be more beneficial to choose different actions and therefore, proposed algorithm with the help of RL presents a more dynamic emergency recovery model.', 'publicationTitle': 'Information Sciences', 'volume': '619', 'issue': '', 'pages': '288-309', 'date': '2023-01-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Information Sciences', 'language': '', 'DOI': '10.1016/j.ins.2022.11.057', 'ISSN': '0020-0255', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0020025522013469', 'accessDate': '2024-08-29T06:23:05Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Emergency situation', 'type': 1}, {'tag': 'Intelligent traffic management', 'type': 1}, {'tag': 'Q-learning', 'type': 1}, {'tag': 'Reinforcement learning', 'type': 1}, {'tag': 'Traffic recovery', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-08-29T06:23:05Z', 'dateModified': '2025-01-19T07:17:22Z'}\n",
      "{'key': 'BPGNYDSU', 'version': 3982, 'itemType': 'journalArticle', 'title': 'A multiagent approach to autonomous intersection management', 'creators': [{'creatorType': 'author', 'firstName': 'Kurt', 'lastName': 'Dresner'}, {'creatorType': 'author', 'firstName': 'Peter', 'lastName': 'Stone'}], 'abstractNote': \"Artificial intelligence research is ushering in a new era of sophisticated, mass-market transportation technology. While computers can already fly a passenger jet better than a trained human pilot, people are still faced with the dangerous yet tedious task of driving automobiles. Intelligent Transportation Systems (ITS) is the field that focuses on integrating information technology with vehicles and transportation infrastructure to make transportation safer, cheaper, and more efficient. Recent advances in ITS point to a future in which vehicles themselves handle the vast majority of the driving task. Once autonomous vehicles become popular, autonomous interactions amongst multiple vehicles will be possible. Current methods of vehicle coordination, which are all designed to work with human drivers, will be outdated. The bottleneck for roadway efficiency will no longer be the drivers, but rather the mechanism by which those drivers' actions are coordinated. While open-road driving is a well-studied and more-or-less-solved problem, urban traffic scenarios, especially intersections, are much more challenging.We believe current methods for controlling traffic, specifically at intersections, will not be able to take advantage of the increased sensitivity and precision of autonomous vehicles as compared to human drivers. In this article, we suggest an alternative mechanism for coordinating the movement of autonomous vehicles through intersections. Drivers and intersections in this mechanism are treated as autonomous agents in a multiagent system. In this multiagent system, intersections use a new reservation-based approach built around a detailed communication protocol, which we also present. We demonstrate in simulation that our new mechanism has the potential to significantly outperform current intersection control technology--traffic lights and stop signs. Because our mechanism can emulate a traffic light or stop sign, it subsumes the most popular current methods of intersection control. This article also presents two extensions to the mechanism. The first extension allows the system to control human-driven vehicles in addition to autonomous vehicles. The second gives priority to emergency vehicles without significant cost to civilian vehicles. The mechanism, including both extensions, is implemented and tested in simulation, and we present experimental results that strongly attest to the efficacy of this approach.\", 'publicationTitle': 'J. Artif. Int. Res.', 'volume': '31', 'issue': '1', 'pages': '591–656', 'date': '三月 1, 2008', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '', 'ISSN': '1076-9757', 'shortTitle': '', 'url': '', 'accessDate': '', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ACM Digital Library', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-08-05T07:37:11Z', 'dateModified': '2025-01-19T07:17:22Z'}\n",
      "{'key': 'ELLMB32I', 'version': 3981, 'itemType': 'journalArticle', 'title': 'A Feasibility Analysis at Signal-Free Intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Filippos N.', 'lastName': 'Tzortzoglou'}, {'creatorType': 'author', 'firstName': 'Logan E.', 'lastName': 'Beaver'}, {'creatorType': 'author', 'firstName': 'Andreas A.', 'lastName': 'Malikopoulos'}], 'abstractNote': 'In this letter, we address the problem of improving the feasible domain of the solution of a decentralized control framework for coordinating connected and automated vehicles (CAVs) at signal-free intersections. The framework provides the optimal trajectories of CAVs to cross the intersection safely without stop-and-go driving. However, when traffic volume exceeds a certain level, finding a feasible solution for a CAV may become unattainable. We use concepts of numerical interpolation to identify appropriate polynomials that can serve as alternative trajectories of the CAVs, expanding the domain of the feasible CAV trajectories. We select the alternative polynomials through an optimization problem that aims at minimizing jerk. Finally, we demonstrate the efficacy of our approach through numerical simulations.', 'publicationTitle': 'IEEE Control Systems Letters', 'volume': '', 'issue': '', 'pages': '1-1', 'date': '2024', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/LCSYS.2024.3410629', 'ISSN': '2475-1456', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/10551377', 'accessDate': '2024-07-05T03:07:07Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Control Systems Letters', 'tags': [{'tag': 'Boundary conditions', 'type': 1}, {'tag': 'Connected automated vehicles', 'type': 1}, {'tag': 'Cruise control', 'type': 1}, {'tag': 'Interpolation', 'type': 1}, {'tag': 'Optimization', 'type': 1}, {'tag': 'Polynomials', 'type': 1}, {'tag': 'Safety', 'type': 1}, {'tag': 'Traffic flow', 'type': 1}, {'tag': 'Trajectory', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T03:07:07Z', 'dateModified': '2025-01-19T07:17:22Z'}\n",
      "{'key': 'PLCLTYFY', 'version': 3994, 'itemType': 'journalArticle', 'title': 'A Conflict Duration Graph-Based Coordination Method for Connected and Automated Vehicles at Signal-Free Intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Zhiyun', 'lastName': 'Deng'}, {'creatorType': 'author', 'firstName': 'Yanjun', 'lastName': 'Shi'}, {'creatorType': 'author', 'firstName': 'Qiaomei', 'lastName': 'Han'}, {'creatorType': 'author', 'firstName': 'Lingling', 'lastName': 'Lv'}, {'creatorType': 'author', 'firstName': 'Weiming', 'lastName': 'Shen'}], 'abstractNote': 'Previous studies on Connected and Automated Vehicles (CAVs) demonstrated the potential to coordinate the behaviors of multiple connected vehicles for traffic improvements. In this paper, we first propose a Conflict Duration Graph-based (CDG-based) coordination framework to resolve collisions and improve the traffic capacity of signal-free intersections. Secondly, a Speed Control-based Intersection Coordination Model (SICM) is developed to identify complex constraints in multi-vehicle collision scenarios. Thirdly, a geometric Translation-based Intersection Coordination Algorithm (TICA) is proposed to calculate the ideal location of time blocks in CDGs and then obtain the near-optimal design speed in the form of combinatorial optimization. Twelve groups of test scenarios with different traffic volumes were designed and tested on a MATLAB-based simulation platform. Simulation results showed that the proposed method can resolve all the collisions and instruct the vehicles to pass signal-free intersections collaboratively without stopping in low to medium level of congestion.', 'publicationTitle': 'Applied Sciences', 'volume': '10', 'issue': '18', 'pages': '6223', 'date': '2020/1', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en', 'DOI': '10.3390/app10186223', 'ISSN': '2076-3417', 'shortTitle': '', 'url': 'https://www.mdpi.com/2076-3417/10/18/6223', 'accessDate': '2024-07-05T01:49:56Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'www.mdpi.com', 'callNumber': '', 'rights': 'http://creativecommons.org/licenses/by/3.0/', 'extra': 'Number: 18\\nPublisher: Multidisciplinary Digital Publishing Institute', 'tags': [{'tag': 'connected and automated vehicles', 'type': 1}, {'tag': 'multi-vehicle collision resolution', 'type': 1}, {'tag': 'signal-free intersection', 'type': 1}, {'tag': 'traffic coordination method', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T01:49:56Z', 'dateModified': '2025-01-19T07:17:21Z'}\n",
      "{'key': 'LXGUHRZM', 'version': 3982, 'itemType': 'journalArticle', 'title': 'A deep learning framework for modelling left-turning vehicle behaviour considering diagonal-crossing motorcycle conflicts at mixed-flow intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Ruoyu', 'lastName': 'Yao'}, {'creatorType': 'author', 'firstName': 'Weiliang', 'lastName': 'Zeng'}, {'creatorType': 'author', 'firstName': 'Yihao', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Zhaoshui', 'lastName': 'He'}], 'abstractNote': 'With heterogeneous traffic agents moving at unprotected phase, severe crossing conflicts are witnessed at mixed-flow intersections, especially when left-turning vehicles are confronted with motorcycles. However, for modelling vehicle turning behaviour, potential conflicts involving diagonal-crossing motorcycles are seldom investigated in existing studies. To explore these scenes, we present a novel interaction-aware deep-learning framework. Firstly, a Long Short-Term Memory (LSTM) based network is employed to encode vehicle historical motion features. Secondly, each vehicle’s potential target lanes are identified with a probabilistic method, followed by a pooling module that extracts and summarizes intention features. Thirdly, Graph Attention Network (GAT) and a synthesized network are introduced to model vehicle-vehicle interaction and vehicle-motorcycle interaction respectively. Finally, multiple kinds of obtained features are sent to a LSTM based decoder module, where both future displacement and body orientation of vehicles are predicted. In short-time simulation experiments, average displacement error is reduced by 47.7% and 20.0% compared to baseline and state-of-the-art methods, with ablation studies conducted to quantify the efficacy of each kind of feature. Moreover, regarding recursive simulation, our model shows availability of reproducing lane-selecting and motorcycle-evasive behaviours. Distributions of post-encroachment time further indicate that the proposed framework can serve as a promising method to realize reliable motion planning for autonomous vehicles.', 'publicationTitle': 'Transportation Research Part C: Emerging Technologies', 'volume': '132', 'issue': '', 'pages': '103415', 'date': '2021-11-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Transportation Research Part C: Emerging Technologies', 'language': '', 'DOI': '10.1016/j.trc.2021.103415', 'ISSN': '0968-090X', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X21004095', 'accessDate': '2024-08-09T10:14:16Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Deep learning', 'type': 1}, {'tag': 'Mixed-flow intersection', 'type': 1}, {'tag': 'Trajectory prediction', 'type': 1}, {'tag': 'Vehicle behaviour modelling', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-08-09T10:14:16Z', 'dateModified': '2025-01-19T07:17:21Z'}\n",
      "{'key': 'Q2CGPQIH', 'version': 3981, 'itemType': 'journalArticle', 'title': 'A decentralized energy-optimal control framework for connected automated vehicles at signal-free intersections', 'creators': [{'creatorType': 'author', 'firstName': 'Andreas A.', 'lastName': 'Malikopoulos'}, {'creatorType': 'author', 'firstName': 'Christos G.', 'lastName': 'Cassandras'}, {'creatorType': 'author', 'firstName': 'Yue J.', 'lastName': 'Zhang'}], 'abstractNote': 'We address the problem of optimally controlling connected and automated vehicles (CAVs) crossing an urban intersection without any explicit traffic signaling, so as to minimize energy consumption subject to a throughput maximization requirement. We show that the solution of the throughput maximization problem depends only on the hard safety constraints imposed on CAVs and its structure enables a decentralized optimal control problem formulation for energy minimization. We present a complete analytical solution of these decentralized problems and derive conditions under which feasible solutions satisfying all safety constraints always exist. The effectiveness of the proposed solution is illustrated through simulation which shows substantial dual benefits of the proposed decentralized framework by allowing CAVs to conserve momentum and fuel while also improving travel time.', 'publicationTitle': 'Automatica', 'volume': '93', 'issue': '', 'pages': '244-256', 'date': '2018-07-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Automatica', 'language': '', 'DOI': '10.1016/j.automatica.2018.03.056', 'ISSN': '0005-1098', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0005109818301511', 'accessDate': '2024-07-05T01:11:06Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Autonomous intersections', 'type': 1}, {'tag': 'Connected and automated vehicles', 'type': 1}, {'tag': 'Decentralized optimal control', 'type': 1}, {'tag': 'Energy usage', 'type': 1}, {'tag': 'Motion planning', 'type': 1}, {'tag': 'Safety', 'type': 1}, {'tag': 'Traffic flow', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T01:11:06Z', 'dateModified': '2025-01-19T07:17:21Z'}\n",
      "{'key': 'SV7ZHNIF', 'version': 3981, 'itemType': 'journalArticle', 'title': 'A consensus-based distributed trajectory control in a signal-free intersection', 'creators': [{'creatorType': 'author', 'firstName': 'Amir', 'lastName': 'Mirheli'}, {'creatorType': 'author', 'firstName': 'Mehrdad', 'lastName': 'Tajalli'}, {'creatorType': 'author', 'firstName': 'Leila', 'lastName': 'Hajibabai'}, {'creatorType': 'author', 'firstName': 'Ali', 'lastName': 'Hajbabaie'}], 'abstractNote': 'This paper develops a distributed cooperative control logic to determine conflict-free trajectories for connected and automated vehicles (CAVs) in signal-free intersections. The cooperative trajectory planning problem is formulated as vehicle-level mixed-integer non-linear programs (MINLPs) that aim to minimize travel time of each vehicle and their speed variations, while avoiding near-crash conditions. To push vehicle-level solutions towards global optimality, we develop a coordination scheme between CAVs on conflicting movements. The coordination scheme shares vehicle states (i.e., location) over a prediction horizon and incorporates such information in CAVs’ respective MINLPs. Therefore, the CAVs will reach consensus through an iterative process and select conflict-free trajectories that minimize their travel time. The numerical experiments quantify the effects of the proposed methodology on traffic safety and performance measures in an intersection. The results show that the proposed distributed coordinated framework converges to near-optimal CAV trajectories with no conflicts in the intersection neighborhood. While the solutions are found in real-time, the comparison to a central intersection control logic for CAVs indicates a maximum marginal objective value of 2.30%. Furthermore, the maximum marginal travel time, throughput, and average speed do not exceed 0.5%, 0.1%, and 0.5%, respectively. The proposed control logic reduced travel time by 43.0–70.5%, and increased throughput and average speed respectively by 0.8–115.6% and 59.1–400.0% compared to an optimized actuated signal control, while eliminating all near-crash conditions.', 'publicationTitle': 'Transportation Research Part C: Emerging Technologies', 'volume': '100', 'issue': '', 'pages': '161-176', 'date': '2019-03-01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': 'Transportation Research Part C: Emerging Technologies', 'language': '', 'DOI': '10.1016/j.trc.2019.01.004', 'ISSN': '0968-090X', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X18311343', 'accessDate': '2024-07-04T16:53:56Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'ScienceDirect', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Connected and autonomous vehicles', 'type': 1}, {'tag': 'Control logic', 'type': 1}, {'tag': 'Cooperative', 'type': 1}, {'tag': 'Coordination', 'type': 1}, {'tag': 'Distributed algorithm', 'type': 1}, {'tag': 'Signal-free', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-04T16:53:57Z', 'dateModified': '2025-01-19T07:17:21Z'}\n",
      "{'key': '6UH6QDGK', 'version': 3980, 'itemType': 'journalArticle', 'title': '自动驾驶环境下交叉口车辆路径规划与最优控制模型', 'creators': [{'creatorType': 'author', 'firstName': 'Wei', 'lastName': 'Wu'}], 'abstractNote': '自动驾驶环境下的交叉口基于车车/车路之间的双向信息交互,\\xa0能保障自动驾驶车辆相互穿插与协作地通过交叉 口,\\xa0而无需信号灯控制.\\xa0因此,\\xa0如何设计高效的面向自动驾驶车辆通行的交叉口管控模型,\\xa0已成为研究的热点.\\xa0已有研究在 建模时,\\xa0均基于自动驾驶车辆在交叉口内部的行驶路径已知并作为模型输入,\\xa0且大多对交叉口内部的冲突点进行简化.\\xa0本文 首先将交叉口空间离散化处理,\\xa0考虑车辆的实际尺寸并面向非常规交叉口,\\xa0使用椭圆曲线建立转弯车辆行驶路径的精确轨 迹方程,\\xa0再通过外边界投影降维法建立轨迹方程和交叉口空间的映射关系.\\xa0建立了基于混合整数线性规划(Mixed\\xa0integer linear\\xa0programming,\\xa0MILP)的自动驾驶交叉口管控模型,\\xa0以交叉口总延误最小为控制目标,\\xa0同时优化车辆在交叉口的最 佳行驶路径和驶入时刻,\\xa0使用AMPL\\xa0(A\\xa0mathematical\\xa0programming\\xa0language)对模型进行编译并使用CPLEX求解器求 解.\\xa0与经典感应控制和先到先服务模型进行对比,\\xa0结果表明,\\xa0本文所提出模型能对车辆进入交叉口的时刻和行驶路径进行双 重优化,\\xa0显著降低自动驾驶车辆通过交叉口的车均延误,\\xa0提高交叉口空间的利用效率.', 'publicationTitle': '', 'volume': '', 'issue': '', 'pages': '', 'date': '2020-9', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '', 'ISSN': '', 'shortTitle': '', 'url': '', 'accessDate': '', 'archive': '', 'archiveLocation': '', 'libraryCatalog': '', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-04T14:03:32Z', 'dateModified': '2025-01-19T07:17:20Z'}\n",
      "{'key': 'BK7WPN7P', 'version': 3979, 'itemType': 'journalArticle', 'title': '学业预警知识图谱的构建与应用', 'creators': [{'creatorType': 'author', 'firstName': '瑾', 'lastName': '闫'}, {'creatorType': 'author', 'firstName': '爽英', 'lastName': '刘'}, {'creatorType': 'author', 'firstName': '姗', 'lastName': '白'}, {'creatorType': 'author', 'firstName': '伟艳', 'lastName': '王'}, {'creatorType': 'author', 'firstName': '丹', 'lastName': '张'}], 'abstractNote': '针对学业预警体系中“事前事中预防”措施不足、过程化与可视化较低的问题，进行了学业预警知识图谱的构建与应用。首先，通过protégé完成模式层的构建，对知识的数据结构(包含实体、关系、属性)进行了设计，采用了树状结构，使每个子类继承其祖先节点的属性；其次，以事实三元组为单位，存储具体的信息；然后，通过关系型数据库实现数据与本体的映射，对关系型数据库的结构化数据进行知识抽取，通过D2RQ工具将结构化数据转化为三元组数据，存储在SQL中；最后，使用Neo4j图数据库可视化展示，完成学业预警知识图谱的构建。公开数据集实验测试结果表明，所构建的学业预警知识图谱能够对学生学业进行预警，对实体及属性进行校验标注，经过采样标注后得到准确率为94.23%,且时效性良好，系统平均在9 ms后开始传输，并在25 ms后完成，同时在过程化与可视化方面有较大提升，可以实现“事前事中预防”。', 'publicationTitle': '中北大学学报(自然科学版)', 'volume': '44', 'issue': '3', 'pages': '256-262', 'date': '2023', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '中文;', 'DOI': '', 'ISSN': '1673-3193', 'shortTitle': '', 'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50kyKUrPKilczQCToTb0b8hR90UZ_coCnRG5-mFTh0KJYq5nsDzG2GIu&uniplatform=NZKPT', 'accessDate': '2023-06-07T07:40:48Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'CNKI', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Neo4j', 'type': 1}, {'tag': 'Neo4j cademic early warning', 'type': 1}, {'tag': 'data layer', 'type': 1}, {'tag': 'knowledge graph', 'type': 1}, {'tag': 'model layer', 'type': 1}, {'tag': '学业预警', 'type': 1}, {'tag': '数据层', 'type': 1}, {'tag': '模式层', 'type': 1}, {'tag': '知识图谱', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2023-06-07T07:40:48Z', 'dateModified': '2025-01-19T07:17:20Z'}\n",
      "{'key': 'EEV67LED', 'version': 3979, 'itemType': 'journalArticle', 'title': '《人工智能应用导论》知识图谱构建与应用研究', 'creators': [{'creatorType': 'author', 'firstName': '军', 'lastName': '张'}, {'creatorType': 'author', 'firstName': '占江', 'lastName': '苑'}, {'creatorType': 'author', 'firstName': '忠明', 'lastName': '杨'}, {'creatorType': 'author', 'firstName': '子健', 'lastName': '李'}, {'creatorType': 'author', 'firstName': '浩然', 'lastName': '刘'}, {'creatorType': 'author', 'firstName': '越', 'lastName': '邓'}, {'creatorType': 'author', 'firstName': '泽楷', 'lastName': '林'}], 'abstractNote': '随着人工智能技术引发新一轮科技革命和社会进步，高职《人工智能应用导论》课程在众多院校相继开设，对其开展知识图谱的构建应用研究逐渐兴起，然而知识图谱的构建需要领域专家共同探讨、手工构建，十分严谨而繁琐，人工智能知识更新又较为快速，目前高职《人工智能应用导论》知识图谱的构建较为缺乏。本文借助机器学习和自然语言处理技术分别对《人工智能应用导论》课程文本资源进行实体识别和关系抽取，接着进行知识融合，最后基于Neo4j图数据库可视化展示并进行知识点推理。实验结果表明：构造后的课程知识图谱涵盖了《人工智能应用导论》课程所有知识点及其关系属性，助力开展知识点学习推理和学习路径推荐研究。', 'publicationTitle': '网络安全技术与应用', 'volume': '', 'issue': '6', 'pages': '96-99', 'date': '2023', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '中文;', 'DOI': '', 'ISSN': '1009-6833', 'shortTitle': '', 'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50ntGgPOL7QfrrIhj5ge6CBH4g2IsyPB8asXh8xKI3kodiPsmDcTH8Zw&uniplatform=NZKPT', 'accessDate': '2023-06-07T07:38:13Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'CNKI', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': '人工智能应用导论课程', 'type': 1}, {'tag': '应用研究', 'type': 1}, {'tag': '推理实验', 'type': 1}, {'tag': '知识图谱', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2023-06-07T07:38:13Z', 'dateModified': '2025-01-19T07:17:19Z'}\n",
      "{'key': '5IAANHVB', 'version': 3979, 'itemType': 'journalArticle', 'title': '时态知识图谱的推理研究综述', 'creators': [{'creatorType': 'author', 'firstName': '英汉', 'lastName': '沈'}, {'creatorType': 'author', 'firstName': '旭晖', 'lastName': '江'}, {'creatorType': 'author', 'firstName': '元卓', 'lastName': '王'}, {'creatorType': 'author', 'firstName': '紫宣', 'lastName': '李'}, {'creatorType': 'author', 'firstName': '子健', 'lastName': '李'}, {'creatorType': 'author', 'firstName': '鹤翔', 'lastName': '谭'}, {'creatorType': 'author', 'firstName': '华伟', 'lastName': '沈'}], 'abstractNote': '随着社交网络、物端感知等技术快速发展,网络空间中涌现了大量的交互、话题、事件、新闻等数据,蕴含大量动态演化、强时效性的知识.较于忽略知识中时间信息的传统知识图谱,时态知识图谱通过建模知识的时效性以描述动态变化的现实世界,为时间紧耦合的应用提供有效支持.然而,时态知识图谱无法确保涵盖全量知识,知识的缺失严重影响应用性能,需要推理模型自动挖掘新的知识,以解释事物的历史状态,预测未来发展趋势并描述演化规律.由于实际应用的迫切需要,近年来,时态知识图谱的推理研究工作层出不穷,逐渐引起学术界和工业界的广泛关注.本文对近年来时态知识图谱的推理工作进行全面介绍和总结.首先,介绍了时态知识图谱的推理相关概念与问题描述;其次,介绍了面向补全任务的推理模型与面向预测任务的推理模型,对其进行比较分析;之后总结了时态知识图谱推理的数据集、推理任务、相关指标以及应用场景;最后展望时态知识图谱推理的未来研究趋势.综上,本文致力于为时态知识图谱的推理领域研究人员提供具有价值的参考,以推动该领域进一步发展.', 'publicationTitle': '计算机学报', 'volume': '46', 'issue': '6', 'pages': '1272-1301', 'date': '2023', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '中文;', 'DOI': '', 'ISSN': '0254-4164', 'shortTitle': '', 'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50n2_FC1ezrNgNLgIgJzEpc4dzL6iDCe0g5IUw98rlIO1hnPzttTlaiP&uniplatform=NZKPT', 'accessDate': '2023-05-31T06:49:29Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'CNKI', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'knowledge completion', 'type': 1}, {'tag': 'knowledge graph', 'type': 1}, {'tag': 'knowledge prediction', 'type': 1}, {'tag': 'temporal knowledge reasoning', 'type': 1}, {'tag': '时态知识图谱', 'type': 1}, {'tag': '时态知识推理', 'type': 1}, {'tag': '知识图谱 temporal knowledge graph', 'type': 1}, {'tag': '知识补全', 'type': 1}, {'tag': '知识预测', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2023-05-31T06:49:29Z', 'dateModified': '2025-01-19T07:17:19Z'}\n",
      "{'key': 'U3RT8R79', 'version': 3978, 'itemType': 'journalArticle', 'title': '基于车路协同的城市应急车辆优先控制：概述与展望', 'creators': [{'creatorType': 'author', 'firstName': '王力', 'lastName': '张立立'}, {'creatorType': 'author', 'firstName': 'Wang Li', 'lastName': 'Zhang Lili'}], 'abstractNote': '面向我国城市常态应急车辆优先通行需求和车路协同智能交通发展的实际情况,总结了应急车辆优先控制发展历程和研究现状,分析并讨论了存在的问题和未来的研究重点。首先,回顾了我国常态应急车辆优先的发展情况；其次,概述了车路协同应急车辆优先控制、结合优先与路径规划的应急车辆优先控制的国内外研究现状；最后,针对当前研究存在的问题进行了讨论并立足车路协同、自动驾驶等新理论与技术的演进总结了常态应急车辆优先领域的研究重点和应着力解决的关键问题。', 'publicationTitle': '科学技术与工程', 'volume': '21', 'issue': '34', 'pages': '14484-14491', 'date': '2021-12-16', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '科学技术与工程', 'language': 'cn', 'DOI': '', 'ISSN': '1671-1815', 'shortTitle': '', 'url': 'http://www.stae.com.cn//jsygc/article/abstract/2101973', 'accessDate': '2024-07-09T05:05:52Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'www.stae.com.cn', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-09T05:05:52Z', 'dateModified': '2025-01-19T07:17:18Z'}\n",
      "{'key': '2R79CFKZ', 'version': 3978, 'itemType': 'journalArticle', 'title': '基于知识图谱理念的生物医学电子学在线课程建设初步探索', 'creators': [{'creatorType': 'author', 'firstName': '华婷', 'lastName': '涂'}, {'creatorType': 'author', 'firstName': '选', 'lastName': '王'}, {'creatorType': 'author', 'firstName': '雨彤', 'lastName': '查'}, {'creatorType': 'author', 'firstName': '加勇', 'lastName': '严'}], 'abstractNote': '为满足疫情期间在线教育需求，基于知识图谱理念，从知识、问题、能力3个维度完善课程知识网络结构，初步探索生物医学电子学在线课程建设新模式，旨在重构教学资源、完善人才培养方案，实现开放、个性、精准的智慧教育体系创新。', 'publicationTitle': '卫生职业教育', 'volume': '41', 'issue': '11', 'pages': '19-23', 'date': '2023', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '中文;', 'DOI': '10.20037/j.issn.1671-1246.2023.11.07', 'ISSN': '1671-1246', 'shortTitle': '', 'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50loVFp_9WcsUA-WIO5VPFBwqL4Z0qvbQZmVs1z40ce6xS5VghuFE8SX&uniplatform=NZKPT', 'accessDate': '2023-06-07T07:40:28Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'CNKI', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'Biomedical electronics', 'type': 1}, {'tag': 'Online course', 'type': 1}, {'tag': '在线课程 Knowledge graph', 'type': 1}, {'tag': '生物医学电子学', 'type': 1}, {'tag': '知识图谱', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2023-06-07T07:40:28Z', 'dateModified': '2025-01-19T07:17:18Z'}\n",
      "{'key': 'GF9YMCPQ', 'version': 3978, 'itemType': 'journalArticle', 'title': '基于博弈论的无信号交叉口冲突消解方法', 'creators': [], 'abstractNote': '', 'publicationTitle': '重庆理工大学学报（自然科学）', 'volume': '35', 'issue': '10', 'pages': '144-151', 'date': '2021-11-09', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '重庆理工大学学报（自然科学）', 'language': 'cn', 'DOI': '', 'ISSN': '1674-8425', 'shortTitle': '', 'url': 'http://clgzk.qks.cqut.edu.cn/CN/abstract/abstract5742.shtml', 'accessDate': '2024-07-05T03:35:09Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'clgzk.qks.cqut.edu.cn', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T03:35:09Z', 'dateModified': '2025-01-19T07:17:17Z'}\n",
      "{'key': 'M928DEBB', 'version': 3978, 'itemType': 'journalArticle', 'title': '地理知识图谱下的建筑群空间分布模式推理', 'creators': [{'creatorType': 'author', 'firstName': '曾杨', 'lastName': '唐'}, {'creatorType': 'author', 'firstName': '廷华', 'lastName': '艾'}, {'creatorType': 'author', 'firstName': '海江', 'lastName': '徐'}], 'abstractNote': '以图结构表达的知识图谱不仅在语义网络的描述与推理中发挥着重要作用，对于空间实体的结构化抽象与空间推理也具有重要意义。空间实体的联系信息在知识图谱中以图的边记录，通过路径探测、子图对齐、模式发现等基于边的知识图谱计算推理，在空间场景认知可发挥重要作用。地理知识图谱是一种对地理概念、实体及其相互关系进行形式化描述的知识系统，既有通用知识的内涵与特点，也有地理知识特定的时空特征，能够将语义模型和时空模型联系起来，描述语义关系、空间关系和时间关系，在地理知识的表达、理解、获取与推理方面有巨大的应用潜力。现有地理知识图谱的研究工作多集中于语义方面，语义关系的抽取与表达比较丰富，可以支持进一步的地理知识语义搜索等功能；然而地理知识图谱在时空模型上的知识表达比较缺乏，现有的空间关系局限在要素之间，很少涉及空间认知中进一步的分布态势、空间格局等，地理知识图谱在空间语义知识方面有待增强。本文基于知识图谱构建原理，以建筑群地理知识图谱构建为例，实现格网型建筑物模式的识别。先将建筑物抽象成实体，表达为图的节点，基于几何邻近分析提取建筑物之间的空间邻域关系，以此构建建筑群地理知识图谱；在此基础上结合建筑物模式识别的领域知识，进一步推理构建其他的空间语义关系，完善地理知识图谱；再将建筑群场景的格网模式表达为知识图谱的规则，在知识图谱上基于NoSQL语言进行推理。结果表明，本文方法能有效提取建筑物格网模式，验证了地理知识图谱在空间推理上的作用和在领域问题研究中的良好适应性，为地理知识图谱在空间认知领域的应用提供了思路。', 'publicationTitle': '地球信息科学学报', 'volume': '25', 'issue': '6', 'pages': '1202-1214', 'date': '2023', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '中文;', 'DOI': '', 'ISSN': '1560-8999', 'shortTitle': '', 'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50lqbdNo8z0wC2CMNOSuvHmHLVNnW2rbj7WrmcNNMwN0lMx5P0x-rTEa&uniplatform=NZKPT', 'accessDate': '2023-06-07T07:41:15Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'CNKI', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'building cluster pattern recognition', 'type': 1}, {'tag': 'geographic entity', 'type': 1}, {'tag': 'geographic knowledge', 'type': 1}, {'tag': 'grid-pattern', 'type': 1}, {'tag': 'spatial cognition', 'type': 1}, {'tag': 'spatial reasoning', 'type': 1}, {'tag': 'spatial relationship', 'type': 1}, {'tag': '地理实体', 'type': 1}, {'tag': '地理知识', 'type': 1}, {'tag': '建筑群模式识别', 'type': 1}, {'tag': '格网模式', 'type': 1}, {'tag': '知识图谱', 'type': 1}, {'tag': '空间关系', 'type': 1}, {'tag': '空间推理', 'type': 1}, {'tag': '空间认知 knowledge graph', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2023-06-07T07:41:15Z', 'dateModified': '2025-01-19T07:17:17Z'}\n",
      "{'key': 'RYUIPFEV', 'version': 3977, 'itemType': 'journalArticle', 'title': '车联网环境下基于间隙优化的无信号交叉口车速控制方法', 'creators': [{'creatorType': 'author', 'firstName': '２', 'lastName': '常玉林１'}], 'abstractNote': '', 'publicationTitle': '重庆理工大学学报（自然科学）', 'volume': '35', 'issue': '3', 'pages': '10-17', 'date': '2021-04-07', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '重庆理工大学学报（自然科学）', 'language': 'cn', 'DOI': '10.3969/j.issn.1674-8425(z).2021.03.002', 'ISSN': '1674-8425', 'shortTitle': '', 'url': 'http://clgzk.qks.cqut.edu.cn/CN/abstract/abstract5478.shtml', 'accessDate': '2024-07-05T03:34:26Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'clgzk.qks.cqut.edu.cn', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T03:34:26Z', 'dateModified': '2025-01-19T07:17:15Z'}\n",
      "{'key': 'CBBZVAF7', 'version': 3977, 'itemType': 'journalArticle', 'title': '车路协同环境下路段掉头区域车辆协同控制', 'creators': [{'creatorType': 'author', 'firstName': '文静', 'lastName': '吴'}, {'creatorType': 'author', 'firstName': '润超', 'lastName': '陈'}, {'creatorType': 'author', 'firstName': '洪飞', 'lastName': '贾'}, {'creatorType': 'author', 'firstName': '清玉', 'lastName': '罗'}, {'creatorType': 'author', 'firstName': '迪', 'lastName': '孙'}], 'abstractNote': '在车辆驾驶安全的前提下,以车速最大为目标研究了车路协同系统(CVIS)环境下掉头区域车辆的协同控制优化方法。分别设置单车连续掉头及车队掉头两种场景进行控制策略的仿真试验。结果表明,本文方法可实现掉头区域车辆的协同控制。最后,以总延误时间、驾驶舒适性为指标评估控制效率,并得到两种控制策略相适用的车头间距的阈值。', 'publicationTitle': '吉林大学学报(工学版)', 'volume': '49', 'issue': '4', 'pages': '1100-1106', 'date': '2019', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'zh-CN', 'DOI': '10.13229/j.cnki.jdxbgxb20180441', 'ISSN': '1671-5497', 'shortTitle': '', 'url': 'https://chn.oversea.cnki.net/KCMS/detail/detail.aspx?dbcode=CJFD&dbname=CJFDLAST2019&filename=JLGY201904010&uniplatform=OVERSEA&v=Rqj5CTy_0_P9bEcOIEZeD28QjPsgNNgfmYsLUA32D4-pvDqu8Ihih0ZRsrFT_Lbd', 'accessDate': '2024-07-17T04:14:27Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'CNKI', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': '交通运输系统工程', 'type': 1}, {'tag': '协同控制', 'type': 1}, {'tag': '路段掉头', 'type': 1}, {'tag': '车队控制', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-17T04:14:27Z', 'dateModified': '2025-01-19T07:17:12Z'}\n",
      "{'key': 'KSLXE2BH', 'version': 3977, 'itemType': 'journalArticle', 'title': '车路协同下车队避让紧急车辆的换道引导方法', 'creators': [{'creatorType': 'author', 'firstName': '朋朋', 'lastName': '焦'}, {'creatorType': 'author', 'firstName': '紫煜', 'lastName': '杨'}, {'creatorType': 'author', 'firstName': '玮琪', 'lastName': '洪'}, {'creatorType': 'author', 'firstName': '泽昊', 'lastName': '王'}], 'abstractNote': '为保证紧急车辆更安全、高效地到达紧急事故现场,基于车路协同系统,提出车队避让紧急车辆的换道引导策略。针对目标车道无车辆、有车辆和有车队3种不同场景,分别提出确保紧急车辆快速通过的协同换道策略。通过协同换道策略引导紧急车辆前方行驶的车队和目标车道的车辆改变速度以调整车辆间距,使其满足换道安全距离,依据换道轨迹规划使车队完成换道,并提出紧急车辆发送紧急避让信号的位置方法,计算当不影响紧急车辆的速度情况下,其发送紧急避让信号时与车队尾车的最短距离。利用SUMO交通仿真软件,实现车路协同环境下3种不同场景车队避让紧急车辆的换道引导,并比较目标车道为车队的场景下,车队换道至目标车队的每个空档中(方式A)和车队换道至目标车队的同一个空档中(方式B)2种不同的换道引导策略。研究结果表明:目标车道有车队的场景下,方式B的协同换道时间更短,发送紧急信号的位置距车队尾车82 m,较方式A的87 m更近,对周围车辆影响更小,因此此场景采用方式B的协同换道策略;在目标车道无车辆、有车辆和有车队3种场景下,紧急车辆分别距车队尾车71,71,82 m时发送紧急避让信号,其可以维持期望速度,验证了最短距离与车辆速度的关系式;与未使用换道引导策略的情况相比,紧急车辆的速度提高,延误减少。', 'publicationTitle': '中国公路学报', 'volume': '34', 'issue': '7', 'pages': '95-104', 'date': '2021', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'zh-CN', 'DOI': '10.19721/j.cnki.1001-7372.2021.07.007', 'ISSN': '1001-7372', 'shortTitle': '', 'url': 'https://kns.cnki.net/kcms2/article/abstract?v=Xlf5kQqXAOlKYazyf-ljzuHvhrTeVEbNl3DDX63-odz7HDdASJVAzcJwZJzvVUrEbgD6GyZfk9cS9ZzyxciZ46DMfT_AYFYK8jEUH9NlxflRam0Y7FqsY2p0QywF8Fczsa-06hHpxjDFjdjWaOxBW2ubTwM6KV6y&uniplatform=NZKPT&language=CHS', 'accessDate': '2024-07-15T07:36:49Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'CNKI', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': '交通工程', 'type': 1}, {'tag': '协同换道', 'type': 1}, {'tag': '换道引导策略', 'type': 1}, {'tag': '紧急避让', 'type': 1}, {'tag': '车路协同', 'type': 1}, {'tag': '车队编组', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-15T07:36:49Z', 'dateModified': '2025-01-19T07:17:11Z'}\n",
      "{'key': 'RS42XFPS', 'version': 3977, 'itemType': 'journalArticle', 'title': '车路协同下避让紧急车辆协同换道策略', 'creators': [{'creatorType': 'author', 'firstName': '威', 'lastName': '郝'}, {'creatorType': 'author', 'firstName': '聪', 'lastName': '梁'}, {'creatorType': 'author', 'firstName': '兆磊', 'lastName': '张'}, {'creatorType': 'author', 'firstName': '能超', 'lastName': '吕'}, {'creatorType': 'author', 'firstName': '可夫', 'lastName': '易'}], 'abstractNote': '为加快紧急车辆抵达事故现场的速度，同时减少紧急车辆优先权对其他车辆的影响，运用车路协同系统，提出避让紧急车辆协同换道策略，通过调整紧急车辆下游车辆位置，实现紧急车辆高效通过路段。以紧急车辆前车（DV）及其相邻目标车道车辆为控制对象，根据相邻车道车辆间距与车车通信范围，搜索DV可换道空间间隙集。以交通流整体恢复稳定时间最小为目标，确定DV换道轨迹和相邻车道协作车辆的速度变化，引导车辆完成协同合流，既能保障车辆安全换道，还能降低换道造成的速度振荡传递。同时，为快速恢复DV换道造成的目标车道车辆速度波动，对上游车辆（UV）采取先进先出规则的换道控制策略。所提协同避让紧急车辆的策略考虑了车辆协同换道对交通流的整体影响，并在原有换道策略的基础上提出了减少速度波动传递的控制方法。案例分析结果表明：采用上下游协同换道策略最短换道时间为6s，此时紧急车辆距前车78.66 m时发送避让信号。同时研究发现，恢复交通流速度稳定所需的时间为29 s，比未采用上下游协同换道策略降低了34%。', 'publicationTitle': '交通信息与安全', 'volume': '40', 'issue': '4', 'pages': '92-100', 'date': '2022', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'zh-CN', 'DOI': '', 'ISSN': '1674-4861', 'shortTitle': '', 'url': 'https://kns.cnki.net/kcms2/article/abstract?v=Xlf5kQqXAOnSvBuBHPaFIxyTinGrDINee5jkjVqvswSTlFcQsqwzOr0FFp9R8WmpH5l55BQ3Asxnuh6zFhhtBtXIZexTcE8t-TxouPdY6k2Vkk5NOC6HVs6XZrvx7C32GIX7M055z-wcXkX91S6Tv1MhuM3MkEXR&uniplatform=NZKPT&language=CHS', 'accessDate': '2024-07-15T07:34:48Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'CNKI', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': '交通工程', 'type': 1}, {'tag': '协同换道策略', 'type': 1}, {'tag': '紧急避让', 'type': 1}, {'tag': '车路协同', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-15T07:34:48Z', 'dateModified': '2025-01-19T07:17:11Z'}\n",
      "{'key': 'LWMJUHSC', 'version': 3976, 'itemType': 'journalArticle', 'title': '高校IT运维知识图谱构建及应用', 'creators': [{'creatorType': 'author', 'firstName': '建青', 'lastName': '李'}], 'abstractNote': '针对高校IT运维中出现的效率低下和服务不佳问题，文章研究设计了基于知识图谱的IT运维总体架构。通过分析运维对象、运维服务流程、运维数据、采集分析处理、运维管理等模块，构建了IT运维问答知识图谱，实现了运维场景和孤立知识点的业务应用关联，基于Neo4j和Python技术开发了IT运维问答系统，解决了运维效率低下和服务质量不佳的问题。', 'publicationTitle': '现代信息科技', 'volume': '7', 'issue': '7', 'pages': '1-5+10', 'date': '2023', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '中文;', 'DOI': '10.19850/j.cnki.2096-4706.2023.07.001', 'ISSN': '2096-4706', 'shortTitle': '', 'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50l8yRr-dwAdi5vhgvFP3RlDDHgKkYlPZYTXibJWzFBHw1euGu1vuenZ&uniplatform=NZKPT', 'accessDate': '2023-05-13T06:57:54Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'CNKI', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'IT operation and maintenance', 'type': 1}, {'tag': 'IT运维', 'type': 1}, {'tag': 'Q&A system', 'type': 1}, {'tag': '知识图谱', 'type': 1}, {'tag': '问答系统 knowledge graph', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2023-05-13T06:57:54Z', 'dateModified': '2025-01-19T07:17:06Z'}\n",
      "{'key': 'MB2JDCWF', 'version': 3976, 'itemType': 'journalArticle', 'title': '自动车环境下交叉口无信号混合控制策略研究', 'creators': [], 'abstractNote': '针对交叉口拥挤且主路和支路流量有较大差异的自动车场景下，基于预约的先到先得（First Come First Serve， FCFS）控制策略效率低于信号配时策略的悖论，提出了一种结合FCFS和车队控制的交叉口无信号混合控制策略.在保证安全性的前提下，以延误最低为目标，优化车辆的通行顺序.引入启发式冲突协调算法和取消预约机制，根据车辆所在车道的车流量区分车辆的优先级，车辆根据实时车流量状况自适应地组成车队通过交叉口，从而减少FCFS策略产生的频繁通行权交换，保证车流量大方向车辆通行的连续性.仿真结果表明：在悖论场景下，混合控制策略较FCFS策略能减少55.84%的总延误.当交叉口总体车流量较大且主路与支路车流量差异较明显时，混合控制策略较FCFS策略在减少延误、提高交叉口通行能力方面的优势更明显.', 'publicationTitle': '北京交通大学学报', 'volume': '46', 'issue': '6', 'pages': '27-35', 'date': '2022/12/15', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'zh', 'DOI': '10.11860/j.issn.1673-0291.20210119', 'ISSN': '1673-0291', 'shortTitle': '', 'url': 'https://jdxb.bjtu.edu.cn/CN/10.11860/j.issn.1673-0291.20210119', 'accessDate': '2024-07-05T04:11:43Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'jdxb.bjtu.edu.cn', 'callNumber': '', 'rights': '', 'extra': 'Number: 6', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T04:11:43Z', 'dateModified': '2025-01-19T07:17:05Z'}\n",
      "{'key': 'GBPRBTJW', 'version': 3976, 'itemType': 'journalArticle', 'title': 'A Hierarchical Robust Control Strategy for Decentralized Signal-Free Intersection Management', 'creators': [{'creatorType': 'author', 'firstName': 'Xiao', 'lastName': 'Pan'}, {'creatorType': 'author', 'firstName': 'Boli', 'lastName': 'Chen'}, {'creatorType': 'author', 'firstName': 'Li', 'lastName': 'Dai'}, {'creatorType': 'author', 'firstName': 'Stelios', 'lastName': 'Timotheou'}, {'creatorType': 'author', 'firstName': 'Simos A.', 'lastName': 'Evangelou'}], 'abstractNote': 'The development of connected and automated vehicles (CAVs) is the key to improving urban mobility safety and efficiency. This article focuses on cooperative vehicle management at a signal-free intersection with consideration of vehicle modeling uncertainties and sensor measurement disturbances. The problem is approached by a hierarchical robust control strategy in a decentralized traffic coordination framework where optimal control and tube-based robust model predictive control methods are designed to hierarchically solve the optimal crossing order and the velocity trajectories of a group of CAVs in terms of energy consumption and throughput. To capture the energy consumption of each vehicle, their powertrain system is modeled in line with an electric drive system. With a suitable relaxation and spatial modeling approach, the optimization problems in the proposed strategy can be formulated as convex second-order cone programs, which provide a unique and computationally efficient solution. A rigorous proof of the equivalence between the convexified and the original problems is also provided. Simulation results illustrate the effectiveness and robustness of the proposed strategy and reveal the impact of traffic density on the control solution. The study of the Pareto optimal solutions for the energy–time objective shows that a minor reduction in journey time can considerably reduce energy consumption, which emphasizes the necessity of optimizing their tradeoff. Finally, the numerical comparisons carried out for different prediction horizons and sampling intervals provide insight into the control design.', 'publicationTitle': 'IEEE Transactions on Control Systems Technology', 'volume': '31', 'issue': '5', 'pages': '2011-2026', 'date': '2023-09', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/TCST.2023.3291536', 'ISSN': '1558-0865', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/10186085', 'accessDate': '2024-07-05T03:22:42Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Transactions on Control Systems Technology', 'tags': [{'tag': 'Computational modeling', 'type': 1}, {'tag': 'Connected and automated vehicles (CAVs)', 'type': 1}, {'tag': 'Energy consumption', 'type': 1}, {'tag': 'Mechanical power transmission', 'type': 1}, {'tag': 'Optimization', 'type': 1}, {'tag': 'Predictive models', 'type': 1}, {'tag': 'Robust control', 'type': 1}, {'tag': 'Uncertainty', 'type': 1}, {'tag': 'convex formulation', 'type': 1}, {'tag': 'cooperative vehicle management', 'type': 1}, {'tag': 'optimization', 'type': 1}, {'tag': 'tube-based robust model predictive control (MPC)', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2024-07-05T03:22:42Z', 'dateModified': '2025-01-19T07:17:05Z'}\n",
      "{'key': 'WGDUNLRR', 'version': 3976, 'itemType': 'journalArticle', 'title': '《山海经》知识图谱构建与应用研究', 'creators': [{'creatorType': 'author', 'firstName': '良兵', 'lastName': '朱'}, {'creatorType': 'author', 'firstName': '飚', 'lastName': '蒙'}], 'abstractNote': '[目的/意义]知识图谱作为数智时代一种先进的知识组织方式，能够为数字人文研究提供良好的技术支持，去洞察那些以往在文本资源中看不见的隐含联系和知识结构。[方法/过程]《山海经》是中国上古三大奇书，具有非常高的研究价值。本文引入主题图和Neo4j等技术和工具，在分析《山海经》中的主题类型、关联关系、属性信息的基础上，提出了构建《山海经》知识图谱的技术架构、数据模型、实施步骤。[结果/结论]将多源异构数据进行集成，完成了《山海经》知识图谱的构建和展示，对于将知识图谱技术应用于数字人文研究领域做了有益的探索。', 'publicationTitle': '情报探索', 'volume': '', 'issue': '5', 'pages': '92-98', 'date': '2023', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '中文;', 'DOI': '', 'ISSN': '1005-8095', 'shortTitle': '', 'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50niO_CdaBNPNX-0uaDVZU7N5AAnulcHnorXGWxEt2gB0eRbKuKlYbR5&uniplatform=NZKPT', 'accessDate': '2023-05-31T06:05:43Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'CNKI', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'association', 'type': 1}, {'tag': 'digital humantiy', 'type': 1}, {'tag': 'knowledge graph', 'type': 1}, {'tag': 'semantic', 'type': 1}, {'tag': 'topic map', 'type': 1}, {'tag': '主题图', 'type': 1}, {'tag': '关联', 'type': 1}, {'tag': '山海经', 'type': 1}, {'tag': '数字人文', 'type': 1}, {'tag': '知识图谱', 'type': 1}, {'tag': '语义 The Classic of Mountains and Seas', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2023-05-31T06:05:43Z', 'dateModified': '2025-01-19T07:17:05Z'}\n",
      "{'key': 'X3KQ5EJH', 'version': 3976, 'itemType': 'journalArticle', 'title': '基于知识图谱的海洋数值预报数据推荐算法', 'creators': [{'creatorType': 'author', 'firstName': '忠伟', 'lastName': '李'}, {'creatorType': 'author', 'firstName': '东', 'lastName': '高'}, {'creatorType': 'author', 'firstName': '昕', 'lastName': '刘'}, {'creatorType': 'author', 'firstName': '金燠', 'lastName': '吴'}], 'abstractNote': '为解决海洋数值预报研究人员面对复杂多样的研究任务时难以及时准确地从种类繁多的海洋数值预报数据中找到所需数据的问题，提出基于知识图谱的海洋数值预报数据推荐算法。利用海洋数值预报文献提取研究任务及海洋数值预报数据构建知识图谱，基于知识图谱计算海洋数值预报数据实体之间的相似度，同时融合在研究人员用户行为下海洋数值预报数据的相似度，进行排序选取相似度较高的海洋数值预报数据进行推荐。实验结果表明，推荐精确率及召回率分别为67.14%、62.49%。', 'publicationTitle': '计算机工程与设计', 'volume': '44', 'issue': '5', 'pages': '1385-1391', 'date': '2023', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '中文;', 'DOI': '10.16208/j.issn1000-7024.2023.05.014', 'ISSN': '1000-7024', 'shortTitle': '', 'url': 'https://kns.cnki.net/kcms2/article/abstract?v=3uoqIhG8C44YLTlOAiTRKu87-SJxoEJu6LL9TJzd50nW0ftToGaH65dirvJKW0nseKn702sm2gcw_U7v3Bi-WOR-4T8vghXQ&uniplatform=NZKPT', 'accessDate': '2023-05-31T03:41:15Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'CNKI', 'callNumber': '', 'rights': '', 'extra': '', 'tags': [{'tag': 'data recommendation', 'type': 1}, {'tag': 'knowledge graph', 'type': 1}, {'tag': 'marine numerical forecasting literature', 'type': 1}, {'tag': 'research task', 'type': 1}, {'tag': 'similarity', 'type': 1}, {'tag': 'user behavior', 'type': 1}, {'tag': '数据推荐 marine numerical forecast', 'type': 1}, {'tag': '海洋数值预报', 'type': 1}, {'tag': '海洋数值预报文献', 'type': 1}, {'tag': '用户行为', 'type': 1}, {'tag': '相似度', 'type': 1}, {'tag': '知识图谱', 'type': 1}, {'tag': '研究任务', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2023-05-31T03:41:15Z', 'dateModified': '2025-01-19T07:17:05Z'}\n",
      "{'key': 'Q6JE6NQ5', 'version': 3976, 'itemType': 'journalArticle', 'title': 'Coordinated lane-changing scheduling of multilane CAV platoons in heterogeneous scenarios', 'creators': [], 'abstractNote': 'With the development of sensing, communication and automated driving technology, connected and automated vehicles (CAVs) are becoming promising soluti…', 'publicationTitle': 'Transportation Research Part C: Emerging Technologies', 'volume': '147', 'issue': '', 'pages': '103992', 'date': '2023/02/01', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': 'en-US', 'DOI': '10.1016/j.trc.2022.103992', 'ISSN': '0968-090X', 'shortTitle': '', 'url': 'https://www.sciencedirect.com/science/article/pii/S0968090X22004053', 'accessDate': '2024-10-23T12:50:05Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'www.sciencedirect.com', 'callNumber': '', 'rights': '', 'extra': 'Publisher: Pergamon', 'tags': [], 'collections': [], 'relations': {}, 'dateAdded': '2024-10-23T12:50:05Z', 'dateModified': '2025-01-19T07:17:03Z'}\n",
      "{'key': 'XY73UEQC', 'version': 3970, 'itemType': 'journalArticle', 'title': 'Intelligent PV Module for Grid-Connected PV Systems', 'creators': [{'creatorType': 'author', 'firstName': 'E.', 'lastName': 'Roman'}, {'creatorType': 'author', 'firstName': 'R.', 'lastName': 'Alonso'}, {'creatorType': 'author', 'firstName': 'P.', 'lastName': 'Ibanez'}, {'creatorType': 'author', 'firstName': 'S.', 'lastName': 'Elorduizapatarietxe'}, {'creatorType': 'author', 'firstName': 'D.', 'lastName': 'Goitia'}], 'abstractNote': 'Most issues carried out about building integrated photovoltaic (PV) system performance show average losses of about 20%-25% in electricity production. The causes are varied, e.g., mismatching losses, partial shadows, variations in current-voltage (I-V) characteristics of PV modules due to manufacturing processes, differences in the orientations and inclinations of solar surfaces, and temperature effects. These losses can be decreased by means of suitable electronics. This paper presents the intelligent PV module concept, a low-cost high-efficiency dc-dc converter with maximum power point tracking (MPPT) functions, control, and power line communications (PLC). In addition, this paper analyses the alternatives for the architecture of grid-connected PV systems: centralized, string, and modular topologies. The proposed system, i.e., the intelligent PV module, fits within this last group. Its principles of operation, as well as the topology of boost dc-dc converter, are analyzed. Besides, a comparison of MPPT methods is performed, which shows the best results for the incremental conductance method. Regarding communications, PLC in every PV module and its feasibility for grid-connected PV plants are considered and analyzed in this paper. After developing an intelligent PV module (with dc-dc converter) prototype, its optimal performance has been experimentally confirmed by means of the PV system test platform. This paper describes this powerful tool especially designed to evaluate all kinds of PV systems', 'publicationTitle': 'IEEE Transactions on Industrial Electronics', 'volume': '53', 'issue': '4', 'pages': '1066-1073', 'date': '2006-06', 'series': '', 'seriesTitle': '', 'seriesText': '', 'journalAbbreviation': '', 'language': '', 'DOI': '10.1109/TIE.2006.878327', 'ISSN': '1557-9948', 'shortTitle': '', 'url': 'https://ieeexplore.ieee.org/abstract/document/1667904', 'accessDate': '2025-01-19T07:15:50Z', 'archive': '', 'archiveLocation': '', 'libraryCatalog': 'IEEE Xplore', 'callNumber': '', 'rights': '', 'extra': 'Conference Name: IEEE Transactions on Industrial Electronics', 'tags': [{'tag': 'Building integrated photovoltaics', 'type': 1}, {'tag': 'Circuit topology', 'type': 1}, {'tag': 'Communication system control', 'type': 1}, {'tag': 'Communication system fault diagnosis', 'type': 1}, {'tag': 'DC-DC power converters', 'type': 1}, {'tag': 'Manufacturing processes', 'type': 1}, {'tag': 'Power line communications', 'type': 1}, {'tag': 'Production systems', 'type': 1}, {'tag': 'Programmable control', 'type': 1}, {'tag': 'System performance', 'type': 1}, {'tag': 'Temperature', 'type': 1}, {'tag': 'dc–dc power conversion', 'type': 1}, {'tag': 'frequency-shift keying (FSK)', 'type': 1}, {'tag': 'photovoltaic (PV) power systems', 'type': 1}, {'tag': 'pulsewidth-modulated (PWM) power converters', 'type': 1}], 'collections': [], 'relations': {}, 'dateAdded': '2025-01-19T07:15:50Z', 'dateModified': '2025-01-19T07:15:50Z'}\n"
     ]
    }
   ],
   "source": [
    "# 要保存的数据列表\n",
    "all_data = []\n",
    "for item in all_items:\n",
    "    # 只提取会议文章，也就是论文的pdf\n",
    "    if 'itemType' in item['data'] and (item['data']['itemType'] == 'journalArticle' or \n",
    "            item['data']['itemType'] == 'preprint' or item['data']['itemType'] == 'conferencePaper'):\n",
    "        tags = []\n",
    "        for tag in item['data']['tags']:\n",
    "            tags.append(tag['tag'])\n",
    "        if(item['data']['creators']):\n",
    "            creator = item['data']['creators'][0]['lastName'] + item['data']['creators'][0]['firstName']\n",
    "        else:\n",
    "            creator = 'Unknown Author'\n",
    "        print(item['data'])\n",
    "        if 'publicationTitle' in item['data']:\n",
    "            publicationTitle = item['data']['publicationTitle']\n",
    "        elif 'conferenceName' in item['data']:\n",
    "            publicationTitle = item['data']['conferenceName']\n",
    "        else:\n",
    "            publicationTitle = 'preprint'\n",
    "        all_data.append({\n",
    "            'title': item['data']['title'],\n",
    "            'creators': creator,\n",
    "            'abstractNote': item['data']['abstractNote'].replace(',', '，'),\n",
    "            'publicationTitle': publicationTitle,\n",
    "            'date': item['data']['date'],\n",
    "            'language': item['data']['language'],\n",
    "            'url': item['data']['url'],\n",
    "            'libraryCatalog': item['data']['libraryCatalog'],\n",
    "            'tags': tags\n",
    "        })"
   ]
  },
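  {
   "cell_type": "markdown",
   "id": "3f7c1a2b-8d4e-4f6a-9b0c-5e1d2a3b4c5f",
   "metadata": {},
   "source": [
    "### Optional: item-type sanity check\n",
    "\n",
    "An illustrative sketch, not part of the original export flow: count how many items of each `itemType` the library holds, assuming only `all_items` from the loading cell. Attachments and notes typically dominate the raw count, which is why the cell above filters by type."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a9b3c2d-1e4f-4a5b-8c6d-2f3e4a5b6c7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "\n",
    "# Tally items per itemType (sanity check; relies only on all_items)\n",
    "type_counts = Counter(item['data'].get('itemType', 'unknown') for item in all_items)\n",
    "for item_type, count in type_counts.most_common():\n",
    "    print(f'{item_type}: {count}')"
   ]
  },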
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "ea7ca10a-6742-4b9f-9bf1-8c5e1968a7b5",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-25T17:06:37.999337Z",
     "start_time": "2025-07-25T17:06:37.852852Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CSV 文件已生成: ../dataset/thesis_item_0725.csv\n"
     ]
    }
   ],
   "source": [
    "# 指定要保存的文件名\n",
    "filename = '../dataset/thesis_item_0725.csv'\n",
    "\n",
    "# 打开文件并写入数据\n",
    "with open(filename, 'w', newline='', encoding='utf-8') as csvfile:\n",
    "    writer = csv.DictWriter(csvfile, fieldnames=fields)\n",
    "    # 写入表头\n",
    "    writer.writeheader()\n",
    "    # 写入数据\n",
    "    writer.writerows(all_data)\n",
    "\n",
    "print('CSV 文件已生成:', filename)"
   ]
  },
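  {
   "cell_type": "markdown",
   "id": "b4c5d6e7-2a3b-4c5d-9e0f-1a2b3c4d5e6f",
   "metadata": {},
   "source": [
    "### Optional: verify the CSV\n",
    "\n",
    "A minimal read-back sketch, assuming the path written by the cell above. It re-opens the file with `csv.DictReader` and reports the row count and header, which should match `len(all_data)` and `fields`. Note that `tags` was a Python list, so `DictWriter` stored its `str()` form and it comes back as a plain string."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8d9e0f1-3b4c-4d5e-8f6a-7b8c9d0e1f2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "# Re-open the export and confirm it round-trips\n",
    "with open('../dataset/thesis_item_0725.csv', newline='', encoding='utf-8') as csvfile:\n",
    "    reader = csv.DictReader(csvfile)\n",
    "    rows = list(reader)\n",
    "\n",
    "print('Rows:', len(rows))\n",
    "print('Columns:', reader.fieldnames)"
   ]
  },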
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c27ece74-f2e2-4ebe-bba9-318bd663fdc6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
